1use super::{ParseError, ParseOptions, ParseResult, ParseWarning};
6use std::io::{Read, Seek, SeekFrom};
7
/// A single lexical token produced from a PDF byte stream.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// `true` or `false`.
    Boolean(bool),

    /// Whole number (no decimal point or exponent seen).
    Integer(i64),

    /// Real number (a decimal point or exponent was seen).
    Real(f64),

    /// Literal `(...)` or hex `<...>` string; raw bytes, not decoded text.
    String(Vec<u8>),

    /// Name object such as `/Type` (leading slash stripped, `#xx` decoded).
    Name(String),

    /// `[`
    ArrayStart,

    /// `]`
    ArrayEnd,

    /// `<<`
    DictStart,

    /// `>>`
    DictEnd,

    /// The `stream` keyword.
    Stream,

    /// The `endstream` keyword.
    EndStream,

    /// The `obj` keyword.
    Obj,

    /// The `endobj` keyword.
    EndObj,

    /// The `startxref` keyword.
    StartXRef,

    /// Indirect reference: (object number, generation number).
    /// NOTE(review): the lexer itself never emits this variant — a bare `R`
    /// is returned as `Name("R")`; presumably a parser layer assembles
    /// `num num R` into a `Reference`. Confirm against the parser module.
    Reference(u32, u16),

    /// The `null` keyword.
    Null,

    /// `%...` comment (leading `%` stripped, text up to end of line).
    Comment(String),

    /// End of input.
    Eof,
}
65
/// Streaming tokenizer for PDF syntax over any byte source.
pub struct Lexer<R> {
    /// Buffered source of raw bytes.
    reader: std::io::BufReader<R>,
    /// Scratch buffer; currently unused (kept for future use).
    #[allow(dead_code)]
    buffer: Vec<u8>,
    /// Count of bytes consumed via `consume_char`, used in error positions.
    /// NOTE(review): not always equal to the underlying stream offset (see
    /// `restore_position`, which resets it to a raw seek offset).
    position: usize,
    /// One byte of lookahead already read from `reader` but not yet consumed.
    peek_buffer: Option<u8>,
    /// LIFO stack of tokens pushed back via `push_token`.
    token_buffer: Vec<Token>,
    /// Parsing behavior flags (lenient syntax/encoding, warning collection).
    options: ParseOptions,
    /// Warnings accumulated while recovering in lenient modes.
    warnings: Vec<ParseWarning>,
}
77
78impl<R: Read> Lexer<R> {
    /// Creates a lexer with default [`ParseOptions`].
    pub fn new(reader: R) -> Self {
        Self::new_with_options(reader, ParseOptions::default())
    }
83
    /// Creates a lexer with explicit [`ParseOptions`].
    pub fn new_with_options(reader: R, options: ParseOptions) -> Self {
        Self {
            reader: std::io::BufReader::new(reader),
            buffer: Vec::with_capacity(1024),
            position: 0,
            peek_buffer: None,
            token_buffer: Vec::new(),
            options,
            warnings: Vec::new(),
        }
    }
96
    /// Warnings collected so far while lexing in lenient modes.
    pub fn warnings(&self) -> &[ParseWarning] {
        &self.warnings
    }
101
    /// Returns the next token from the stream.
    ///
    /// Tokens pushed back via `push_token` are served first, in LIFO order.
    /// End of input yields `Token::Eof` rather than an error.
    pub fn next_token(&mut self) -> ParseResult<Token> {
        // Serve any pushed-back token before touching the stream.
        if let Some(token) = self.token_buffer.pop() {
            return Ok(token);
        }

        self.skip_whitespace()?;

        let ch = match self.peek_char()? {
            Some(ch) => ch,
            None => return Ok(Token::Eof),
        };

        match ch {
            b'%' => self.read_comment(),
            b'/' => self.read_name(),
            b'(' => self.read_literal_string(),
            b'<' => self.read_angle_bracket(),
            b'>' => {
                self.consume_char()?;
                // A lone '>' is meaningless outside of '>>'.
                if self.peek_char()? == Some(b'>') {
                    self.consume_char()?;
                    Ok(Token::DictEnd)
                } else {
                    Err(ParseError::SyntaxError {
                        position: self.position,
                        message: "Expected '>' after '>'".to_string(),
                    })
                }
            }
            b'[' => {
                self.consume_char()?;
                Ok(Token::ArrayStart)
            }
            b']' => {
                self.consume_char()?;
                Ok(Token::ArrayEnd)
            }
            // These may still fall through to generic keyword handling when
            // the word is not exactly "true"/"false"/"null".
            b't' | b'f' => self.read_boolean(),
            b'n' => self.read_null(),
            b'+' | b'-' | b'0'..=b'9' | b'.' => self.read_number(),
            b'R' => {
                self.consume_char()?;
                // NOTE(review): consumes a single 'R' and emits Name("R");
                // a word such as "Ref" would therefore be split in two —
                // presumably fine because 'R' only appears as the reference
                // keyword in well-formed PDF. Confirm with the parser layer.
                Ok(Token::Name("R".to_string()))
            }
            _ if ch.is_ascii_alphabetic() => self.read_keyword(),
            _ => {
                // Non-printable/suspect bytes get the encoding recovery path.
                if self.is_problematic_encoding_char(ch) {
                    self.handle_encoding_char_in_token_stream(ch)
                } else {
                    Err(ParseError::SyntaxError {
                        position: self.position,
                        message: format!("Unexpected character: {}", ch as char),
                    })
                }
            }
        }
    }
163
164 fn peek_char(&mut self) -> ParseResult<Option<u8>> {
166 if let Some(ch) = self.peek_buffer {
167 return Ok(Some(ch));
168 }
169
170 let mut buf = [0u8; 1];
171 match self.reader.read_exact(&mut buf) {
172 Ok(_) => {
173 self.peek_buffer = Some(buf[0]);
174 Ok(Some(buf[0]))
175 }
176 Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => Ok(None),
177 Err(e) => Err(e.into()),
178 }
179 }
180
181 fn consume_char(&mut self) -> ParseResult<Option<u8>> {
183 let ch = self.peek_char()?;
184 if ch.is_some() {
185 self.peek_buffer = None;
186 self.position += 1;
187 }
188 Ok(ch)
189 }
190
191 pub(crate) fn skip_whitespace(&mut self) -> ParseResult<usize> {
193 let mut count = 0;
194 while let Some(ch) = self.peek_char()? {
195 if ch.is_ascii_whitespace() {
196 self.consume_char()?;
197 count += 1;
198 } else {
199 break;
200 }
201 }
202 Ok(count)
203 }
204
205 fn read_comment(&mut self) -> ParseResult<Token> {
207 self.consume_char()?; let mut comment = String::new();
209
210 while let Some(ch) = self.peek_char()? {
211 if ch == b'\n' || ch == b'\r' {
212 break;
213 }
214 self.consume_char()?;
215 comment.push(ch as char);
216 }
217
218 Ok(Token::Comment(comment))
219 }
220
    /// Reads a name object following its `/` delimiter.
    ///
    /// `#xx` hex escapes are decoded to the named byte. Bytes are widened to
    /// `char` directly, i.e. non-ASCII bytes are interpreted as Latin-1.
    fn read_name(&mut self) -> ParseResult<Token> {
        self.consume_char()?; // discard the leading '/'
        let mut name = String::new();

        while let Some(ch) = self.peek_char()? {
            // Whitespace or any PDF delimiter ends the name.
            if ch.is_ascii_whitespace()
                || matches!(ch, b'/' | b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'%')
            {
                break;
            }
            self.consume_char()?;

            if ch == b'#' {
                // Exactly two hex digits must follow; EOF or a non-hex
                // character is a syntax error (caught by from_str_radix).
                let hex1 = self
                    .consume_char()?
                    .ok_or_else(|| ParseError::SyntaxError {
                        position: self.position,
                        message: "Incomplete hex code in name".to_string(),
                    })?;
                let hex2 = self
                    .consume_char()?
                    .ok_or_else(|| ParseError::SyntaxError {
                        position: self.position,
                        message: "Incomplete hex code in name".to_string(),
                    })?;

                let value = u8::from_str_radix(&format!("{}{}", hex1 as char, hex2 as char), 16)
                    .map_err(|_| ParseError::SyntaxError {
                        position: self.position,
                        message: "Invalid hex code in name".to_string(),
                    })?;

                name.push(value as char);
            } else {
                name.push(ch as char);
            }
        }

        Ok(Token::Name(name))
    }
263
264 fn read_literal_string(&mut self) -> ParseResult<Token> {
266 self.consume_char()?; let mut string = Vec::new();
268 let mut paren_depth = 1;
269 let mut escape = false;
270
271 while paren_depth > 0 {
272 let ch = match self.consume_char()? {
273 Some(c) => c,
274 None => {
275 if self.options.lenient_syntax {
276 if self.options.collect_warnings {
278 self.warnings.push(ParseWarning::SyntaxErrorRecovered {
279 position: self.position,
280 expected: "closing parenthesis".to_string(),
281 found: "EOF".to_string(),
282 recovery_action: "returned partial string content".to_string(),
283 });
284 }
285 break;
286 } else {
287 return Err(ParseError::SyntaxError {
288 position: self.position,
289 message: "Unterminated string".to_string(),
290 });
291 }
292 }
293 };
294
295 if escape {
296 let escaped = match ch {
297 b'n' => b'\n',
298 b'r' => b'\r',
299 b't' => b'\t',
300 b'b' => b'\x08',
301 b'f' => b'\x0C',
302 b'(' => b'(',
303 b')' => b')',
304 b'\\' => b'\\',
305 b'0'..=b'7' => {
306 let mut value = ch - b'0';
308 for _ in 0..2 {
309 if let Some(next) = self.peek_char()? {
310 if matches!(next, b'0'..=b'7') {
311 self.consume_char()?;
312 value = value * 8 + (next - b'0');
313 } else {
314 break;
315 }
316 }
317 }
318 value
319 }
320 _ => ch, };
322 string.push(escaped);
323 escape = false;
324 } else {
325 match ch {
326 b'\\' => escape = true,
327 b'(' => {
328 string.push(ch);
329 paren_depth += 1;
330 }
331 b')' => {
332 paren_depth -= 1;
333 if paren_depth > 0 {
334 string.push(ch);
335 }
336 }
337 _ => string.push(ch),
338 }
339 }
340 }
341
342 let processed_string = if self.options.lenient_encoding {
344 self.process_string_with_encoding_recovery(&string)?
345 } else {
346 string
347 };
348
349 Ok(Token::String(processed_string))
350 }
351
    /// Dispatches `<`: either a dictionary opener (`<<`) or a hex string.
    ///
    /// Hex strings skip embedded whitespace; an odd digit count is padded
    /// with a trailing `0` (the spec's "missing final digit is zero" rule).
    /// In lenient-syntax mode, invalid digits and a missing `>` are
    /// recovered with warnings instead of failing.
    fn read_angle_bracket(&mut self) -> ParseResult<Token> {
        self.consume_char()?; // discard the first '<'
        if self.peek_char()? == Some(b'<') {
            self.consume_char()?;
            Ok(Token::DictStart)
        } else {
            let mut hex_chars = String::new();
            let mut found_end = false;

            while let Some(ch) = self.peek_char()? {
                if ch == b'>' {
                    self.consume_char()?;
                    found_end = true;
                    break;
                }
                self.consume_char()?;
                if ch.is_ascii_hexdigit() {
                    hex_chars.push(ch as char);
                } else if !ch.is_ascii_whitespace() {
                    // Not a hex digit, not whitespace: recover or fail.
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            self.warnings.push(ParseWarning::SyntaxErrorRecovered {
                                position: self.position,
                                expected: "hex digit".to_string(),
                                found: format!("'{}'", ch as char),
                                recovery_action: "skipped invalid character".to_string(),
                            });
                        }
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: self.position,
                            message: "Invalid character in hex string".to_string(),
                        });
                    }
                }
            }

            if !found_end {
                if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        self.warnings.push(ParseWarning::SyntaxErrorRecovered {
                            position: self.position,
                            expected: ">".to_string(),
                            found: "EOF".to_string(),
                            recovery_action: "returned partial hex string".to_string(),
                        });
                    }
                } else {
                    return Err(ParseError::SyntaxError {
                        position: self.position,
                        message: "Unterminated hex string".to_string(),
                    });
                }
            }

            // Odd digit count: treat the missing final digit as zero.
            if hex_chars.len() % 2 != 0 {
                hex_chars.push('0');
            }

            // Decode each digit pair into one byte.
            let mut bytes = Vec::new();
            for chunk in hex_chars.as_bytes().chunks(2) {
                let hex_str = std::str::from_utf8(chunk).map_err(|_| ParseError::SyntaxError {
                    position: self.position,
                    message: "Invalid UTF-8 in hex string".to_string(),
                })?;
                let byte =
                    u8::from_str_radix(hex_str, 16).map_err(|_| ParseError::SyntaxError {
                        position: self.position,
                        message: "Invalid hex string".to_string(),
                    })?;
                bytes.push(byte);
            }

            Ok(Token::String(bytes))
        }
    }
435
436 fn read_boolean(&mut self) -> ParseResult<Token> {
438 let word = self.read_word()?;
439 match word.as_str() {
440 "true" => Ok(Token::Boolean(true)),
441 "false" => Ok(Token::Boolean(false)),
442 _ => {
443 self.process_keyword(word)
445 }
446 }
447 }
448
449 fn read_null(&mut self) -> ParseResult<Token> {
451 let word = self.read_word()?;
452 if word == "null" {
453 Ok(Token::Null)
454 } else {
455 self.process_keyword(word)
457 }
458 }
459
    /// Reads an integer or real number, with an optional leading sign and an
    /// optional exponent (`e`/`E`).
    ///
    /// NOTE(review): exponent notation is not part of standard PDF number
    /// syntax; accepting it (and forcing the result to `Real`) appears to be
    /// a deliberate tolerance — the unit tests exercise it.
    fn read_number(&mut self) -> ParseResult<Token> {
        let mut number_str = String::new();
        let mut has_dot = false;

        // Optional sign, which must be followed by a digit or a dot.
        if let Some(ch) = self.peek_char()? {
            if ch == b'+' || ch == b'-' {
                self.consume_char()?;
                number_str.push(ch as char);

                if let Some(next) = self.peek_char()? {
                    if !next.is_ascii_digit() && next != b'.' {
                        return Err(ParseError::SyntaxError {
                            position: self.position,
                            message: "Expected digit after sign".to_string(),
                        });
                    }
                }
            }
        }

        // Digits with at most one decimal point.
        while let Some(ch) = self.peek_char()? {
            match ch {
                b'0'..=b'9' => {
                    self.consume_char()?;
                    number_str.push(ch as char);
                }
                b'.' if !has_dot => {
                    self.consume_char()?;
                    number_str.push(ch as char);
                    has_dot = true;
                }
                _ => break,
            }
        }

        // Optional exponent; its presence forces a Real result.
        if let Some(ch) = self.peek_char()? {
            if ch == b'e' || ch == b'E' {
                self.consume_char()?;
                number_str.push(ch as char);

                if let Some(sign_ch) = self.peek_char()? {
                    if sign_ch == b'+' || sign_ch == b'-' {
                        self.consume_char()?;
                        number_str.push(sign_ch as char);
                    }
                }

                while let Some(digit_ch) = self.peek_char()? {
                    if digit_ch.is_ascii_digit() {
                        self.consume_char()?;
                        number_str.push(digit_ch as char);
                    } else {
                        break;
                    }
                }

                has_dot = true;
            }
        }

        // An unparsable accumulation (e.g. "." or "12e") becomes an error.
        if has_dot {
            let value = number_str
                .parse::<f64>()
                .map_err(|_| ParseError::SyntaxError {
                    position: self.position,
                    message: format!("Invalid real number: '{number_str}'"),
                })?;
            Ok(Token::Real(value))
        } else {
            let value = number_str
                .parse::<i64>()
                .map_err(|_| ParseError::SyntaxError {
                    position: self.position,
                    message: format!("Invalid integer: '{number_str}'"),
                })?;
            Ok(Token::Integer(value))
        }
    }
550
    /// Reads an alphabetic word and maps it to its keyword token.
    fn read_keyword(&mut self) -> ParseResult<Token> {
        let word = self.read_word()?;
        self.process_keyword(word)
    }
556
557 fn process_keyword(&self, word: String) -> ParseResult<Token> {
559 match word.as_str() {
560 "stream" => Ok(Token::Stream),
561 "endstream" => Ok(Token::EndStream),
562 "obj" => Ok(Token::Obj),
563 "endobj" => Ok(Token::EndObj),
564 "startxref" => Ok(Token::StartXRef),
565 _ => Err(ParseError::SyntaxError {
566 position: self.position,
567 message: format!("Unknown keyword: {word}"),
568 }),
569 }
570 }
571
572 fn read_word(&mut self) -> ParseResult<String> {
574 let mut word = String::new();
575
576 while let Some(ch) = self.peek_char()? {
577 if ch.is_ascii_whitespace()
578 || matches!(ch, b'/' | b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'%')
579 {
580 break;
581 }
582 self.consume_char()?;
583 word.push(ch as char);
584 }
585
586 Ok(word)
587 }
588
589 #[allow(dead_code)]
591 fn read_digits(&mut self) -> ParseResult<String> {
592 let mut digits = String::new();
593
594 while let Some(ch) = self.peek_char()? {
595 if ch.is_ascii_digit() {
596 self.consume_char()?;
597 digits.push(ch as char);
598 } else {
599 break;
600 }
601 }
602
603 Ok(digits)
604 }
605
606 pub fn read_newline(&mut self) -> ParseResult<()> {
608 match self.peek_char()? {
609 Some(b'\r') => {
610 self.consume_char()?;
611 if self.peek_char()? == Some(b'\n') {
613 self.consume_char()?;
614 }
615 Ok(())
616 }
617 Some(b'\n') => {
618 self.consume_char()?;
619 Ok(())
620 }
621 _ => Err(ParseError::SyntaxError {
622 position: self.position,
623 message: "Expected newline".to_string(),
624 }),
625 }
626 }
627
628 pub fn peek_byte(&mut self) -> ParseResult<u8> {
631 match self.peek_char()? {
632 Some(b) => Ok(b),
633 None => Err(ParseError::UnexpectedToken {
634 expected: "byte".to_string(),
635 found: "EOF".to_string(),
636 }),
637 }
638 }
639
640 pub fn read_byte(&mut self) -> ParseResult<u8> {
642 match self.consume_char()? {
643 Some(b) => Ok(b),
644 None => Err(ParseError::UnexpectedToken {
645 expected: "byte".to_string(),
646 found: "EOF".to_string(),
647 }),
648 }
649 }
650
651 pub fn seek(&mut self, pos: u64) -> ParseResult<()>
653 where
654 R: Seek,
655 {
656 self.reader.seek(SeekFrom::Start(pos))?;
657 self.position = pos as usize;
658 Ok(())
659 }
660
661 pub fn read_bytes(&mut self, n: usize) -> ParseResult<Vec<u8>> {
662 let mut bytes = vec![0u8; n];
663 self.reader.read_exact(&mut bytes)?;
664 self.position += n;
665 Ok(bytes)
666 }
667
668 pub fn read_until_sequence(&mut self, sequence: &[u8]) -> ParseResult<Vec<u8>> {
670 let mut result = Vec::new();
671 let mut match_pos = 0;
672
673 while let Some(ch) = self.consume_char()? {
674 result.push(ch);
675
676 if ch == sequence[match_pos] {
677 match_pos += 1;
678 if match_pos == sequence.len() {
679 result.truncate(result.len() - sequence.len());
681 break;
682 }
683 } else if ch == sequence[0] {
684 match_pos = 1;
685 } else {
686 match_pos = 0;
687 }
688 }
689
690 if match_pos < sequence.len() {
691 return Err(ParseError::SyntaxError {
692 position: self.position,
693 message: format!("Sequence {sequence:?} not found"),
694 });
695 }
696
697 Ok(result)
698 }
699
    /// Number of bytes consumed so far (used in error positions).
    pub fn position(&self) -> usize {
        self.position
    }
704
    /// Pushes a token back; pushed tokens are returned by `next_token` in
    /// LIFO order before any further input is read.
    pub fn push_token(&mut self, token: Token) {
        self.token_buffer.push(token);
    }
709
710 pub fn expect_keyword(&mut self, keyword: &str) -> ParseResult<()> {
712 let token = self.next_token()?;
713 match (keyword, &token) {
714 ("endstream", Token::EndStream) => Ok(()),
715 ("stream", Token::Stream) => Ok(()),
716 ("endobj", Token::EndObj) => Ok(()),
717 ("obj", Token::Obj) => Ok(()),
718 ("startxref", Token::StartXRef) => Ok(()),
719 _ => Err(ParseError::UnexpectedToken {
720 expected: format!("keyword '{keyword}'"),
721 found: format!("{token:?}"),
722 }),
723 }
724 }
725
    /// Scans forward up to `max_bytes` for `keyword`, restoring the reader
    /// position (and pending lookahead byte) afterwards.
    ///
    /// Returns the offset of the keyword's first byte relative to the scan
    /// start, or `None` if not found within the window.
    ///
    /// NOTE(review): the scan starts at the reader's physical position; a
    /// byte pending in `peek_buffer` is not examined, so the offset can be
    /// off by one relative to the logical token stream — confirm callers
    /// only invoke this between whole tokens.
    pub fn find_keyword_ahead(
        &mut self,
        keyword: &str,
        max_bytes: usize,
    ) -> ParseResult<Option<usize>>
    where
        R: Seek,
    {
        use std::io::{Read, Seek, SeekFrom};

        // Remember the state so the scan is observably side-effect free.
        let current_pos = self.reader.stream_position()?;
        let start_buffer_state = self.peek_buffer;

        let keyword_bytes = keyword.as_bytes();
        let mut bytes_read = 0;
        let mut match_buffer = Vec::new();

        while bytes_read < max_bytes {
            let mut byte = [0u8; 1];
            match self.reader.read_exact(&mut byte) {
                Ok(_) => {
                    bytes_read += 1;
                    match_buffer.push(byte[0]);

                    // Keep a sliding window of the last keyword-length bytes.
                    if match_buffer.len() > keyword_bytes.len() {
                        match_buffer.remove(0);
                    }

                    if match_buffer.len() == keyword_bytes.len() && match_buffer == keyword_bytes {
                        self.reader.seek(SeekFrom::Start(current_pos))?;
                        self.peek_buffer = start_buffer_state;
                        return Ok(Some(bytes_read - keyword_bytes.len()));
                    }
                }
                Err(_) => break, // EOF (or read error) ends the scan early
            }
        }

        self.reader.seek(SeekFrom::Start(current_pos))?;
        self.peek_buffer = start_buffer_state;
        Ok(None)
    }
776
777 pub fn peek_ahead(&mut self, n: usize) -> ParseResult<Vec<u8>>
779 where
780 R: Seek,
781 {
782 use std::io::{Read, Seek, SeekFrom};
783
784 let current_pos = self.reader.stream_position()?;
786 let start_buffer_state = self.peek_buffer;
787
788 let mut bytes = vec![0u8; n];
790 let bytes_read = self.reader.read(&mut bytes)?;
791 bytes.truncate(bytes_read);
792
793 self.reader.seek(SeekFrom::Start(current_pos))?;
795 self.peek_buffer = start_buffer_state;
796
797 Ok(bytes)
798 }
799
    /// Snapshots the reader offset and pending lookahead byte so parsing can
    /// later be rewound with `restore_position`.
    pub fn save_position(&mut self) -> ParseResult<(u64, Option<u8>)>
    where
        R: Seek,
    {
        use std::io::Seek;
        let pos = self.reader.stream_position()?;
        Ok((pos, self.peek_buffer))
    }
809
    /// Rewinds to a snapshot taken by `save_position`.
    ///
    /// NOTE(review): `position` is reset to the raw stream offset, which is
    /// not exactly the consumed-byte counter (the pending lookahead byte is
    /// not accounted for). It is only used in error messages, so the
    /// approximation appears intentional — confirm.
    pub fn restore_position(&mut self, saved: (u64, Option<u8>)) -> ParseResult<()>
    where
        R: Seek,
    {
        use std::io::{Seek, SeekFrom};
        self.reader.seek(SeekFrom::Start(saved.0))?;
        self.peek_buffer = saved.1;
        self.position = saved.0 as usize;
        Ok(())
    }
821
822 pub fn peek_token(&mut self) -> ParseResult<Token>
824 where
825 R: Seek,
826 {
827 let saved_pos = self.save_position()?;
828 let token = self.next_token()?;
829 self.restore_position(saved_pos)?;
830 Ok(token)
831 }
832
    /// Runs decoded-text recovery over literal-string bytes when lenient
    /// encoding is enabled.
    ///
    /// Bytes that commonly signal a mis-encoded string (C1 range, bell,
    /// other C0 controls) switch the decoder into a forced-lenient mode
    /// with Windows-1252 preferred. If the decoder fails outright, a
    /// byte-level ASCII fallback is applied in lenient mode; otherwise the
    /// error is surfaced as a `CharacterEncodingError`.
    fn process_string_with_encoding_recovery(
        &mut self,
        string_bytes: &[u8],
    ) -> ParseResult<Vec<u8>> {
        use super::encoding::{CharacterDecoder, EncodingOptions, EncodingType, EnhancedDecoder};

        // Heuristic: C1 bytes, bell, or C0 controls other than TAB/LF/CR
        // suggest the bytes are not in the expected encoding.
        let has_problematic_chars = string_bytes.iter().any(|&b| {
            (0x80..=0x9F).contains(&b)
                || b == 0x07
                || (b <= 0x1F && b != 0x09 && b != 0x0A && b != 0x0D)
        });

        let decoder = EnhancedDecoder::new();

        let encoding_options = if has_problematic_chars {
            EncodingOptions {
                lenient_mode: true,
                // 0x80..=0x9F bytes usually mean Windows-1252 text.
                preferred_encoding: Some(EncodingType::Windows1252),
                // Scale the replacement budget with input size.
                max_replacements: std::cmp::max(100, string_bytes.len() / 10),
                log_issues: self.options.collect_warnings,
            }
        } else {
            EncodingOptions {
                lenient_mode: self.options.lenient_encoding,
                preferred_encoding: self.options.preferred_encoding,
                max_replacements: 50,
                log_issues: self.options.collect_warnings,
            }
        };

        match decoder.decode(string_bytes, &encoding_options) {
            Ok(result) => {
                if (result.replacement_count > 0 || has_problematic_chars)
                    && self.options.collect_warnings
                {
                    self.warnings.push(ParseWarning::InvalidEncoding {
                        position: self.position,
                        recovered_text: if result.text.len() > 50 {
                            // Truncate on a char boundary to keep the slice
                            // valid UTF-8.
                            let truncate_at = result
                                .text
                                .char_indices()
                                .map(|(i, _)| i)
                                .nth(47)
                                .unwrap_or(result.text.len().min(47));
                            format!(
                                "{}... (truncated, {} chars total)",
                                &result.text[..truncate_at],
                                result.text.chars().count()
                            )
                        } else {
                            result.text.clone()
                        },
                        encoding_used: result.detected_encoding,
                        replacement_count: result.replacement_count,
                    });
                }

                Ok(result.text.into_bytes())
            }
            Err(encoding_error) => {
                if self.options.lenient_encoding {
                    // Decoder gave up: apply the byte-level ASCII fallback.
                    let fallback_result = self.apply_fallback_encoding_strategy(string_bytes);

                    if self.options.collect_warnings {
                        self.warnings.push(ParseWarning::InvalidEncoding {
                            position: self.position,
                            recovered_text: format!(
                                "Fallback strategy applied: {} -> {} chars",
                                string_bytes.len(),
                                fallback_result.len()
                            ),
                            encoding_used: None,
                            replacement_count: string_bytes.len(),
                        });
                    }
                    Ok(fallback_result)
                } else {
                    Err(ParseError::CharacterEncodingError {
                        position: self.position,
                        message: format!(
                            "Failed to decode string with any supported encoding: {encoding_error}"
                        ),
                    })
                }
            }
        }
    }
928
929 fn apply_fallback_encoding_strategy(&self, string_bytes: &[u8]) -> Vec<u8> {
931 let mut result = Vec::with_capacity(string_bytes.len());
932
933 for &byte in string_bytes {
934 match byte {
935 0x00..=0x08 | 0x0B | 0x0C | 0x0E..=0x1F => {
937 result.push(b' '); }
939 0x80..=0x9F => {
940 let replacement = match byte {
942 0x80 => b'E', 0x81 => b' ', 0x82 => b',', 0x83 => b'f', 0x84 => b'"', 0x85 => b'.', 0x86 => b'+', 0x87 => b'+', 0x88 => b'^', 0x89 => b'%', 0x8A => b'S', 0x8B => b'<', 0x8C => b'O', 0x8D => b' ', 0x8E => b'Z', 0x8F => b' ', 0x90 => b' ', 0x91 => b'\'', 0x92 => b'\'', 0x93 => b'"', 0x94 => b'"', 0x95 => b'*', 0x96 => b'-', 0x97 => b'-', 0x98 => b'~', 0x99 => b'T', 0x9A => b's', 0x9B => b'>', 0x9C => b'o', 0x9D => b' ', 0x9E => b'z', 0x9F => b'Y', _ => b'?', };
976 result.push(replacement);
977 }
978 _ => {
979 result.push(byte); }
981 }
982 }
983
984 result
985 }
986
987 fn is_problematic_encoding_char(&self, ch: u8) -> bool {
989 (0x80..=0x9F).contains(&ch) ||
991 ch == 0x07 || (ch <= 0x1F && ch != 0x09 && ch != 0x0A && ch != 0x0D) }
994
    /// Recovery path for a problematic byte encountered between tokens.
    ///
    /// In lenient-encoding mode the byte is skipped (optionally recording a
    /// warning) and lexing continues with the next token; otherwise a
    /// descriptive `CharacterEncodingError` is returned.
    fn handle_encoding_char_in_token_stream(&mut self, ch: u8) -> ParseResult<Token> {
        if self.options.lenient_encoding {
            self.consume_char()?;

            if self.options.collect_warnings {
                // Classify the skipped byte for the warning text. The
                // 0x07 arm must precede the broader control-range arm.
                let replacement_char = match ch {
                    0x07 => "bell",
                    0x00..=0x1F => "control",
                    0x80..=0x9F => "latin1-supplement",
                    _ => "unknown",
                };

                self.warnings.push(ParseWarning::InvalidEncoding {
                    position: self.position,
                    recovered_text: format!(
                        "Skipped problematic {replacement_char} character (0x{ch:02X})"
                    ),
                    encoding_used: None,
                    replacement_count: 1,
                });
            }

            // Resume lexing past the skipped byte. EOF here is an error
            // because the caller asked for another token.
            self.skip_whitespace()?;
            if let Ok(Some(_)) = self.peek_char() {
                self.next_token()
            } else {
                Err(ParseError::SyntaxError {
                    position: self.position,
                    message: "Unexpected end of file after problematic character".to_string(),
                })
            }
        } else {
            let char_description = match ch {
                0x07 => "Bell character (\\u{07})".to_string(),
                0x00..=0x1F => format!("Control character (\\u{{{ch:02X}}})"),
                0x80..=0x9F => format!("Latin-1 supplement character (\\u{{{ch:02X}}})"),
                _ => format!("Problematic character (\\u{{{ch:02X}}})"),
            };

            Err(ParseError::CharacterEncodingError {
                position: self.position,
                message: format!(
                    "Unexpected character: {char_description} - Consider using lenient parsing mode"
                ),
            })
        }
    }
1047}
1048
1049#[cfg(test)]
1050mod tests {
1051 use super::*;
1052 use std::io::Cursor;
1053
    // Smoke test: one of each primitive token kind, ending in Eof.
    #[test]
    fn test_lexer_basic_tokens() {
        let input = b"123 -456 3.14 true false null /Name";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(-456));
        assert_eq!(lexer.next_token().unwrap(), Token::Real(3.14));
        assert_eq!(lexer.next_token().unwrap(), Token::Boolean(true));
        assert_eq!(lexer.next_token().unwrap(), Token::Boolean(false));
        assert_eq!(lexer.next_token().unwrap(), Token::Null);
        assert_eq!(lexer.next_token().unwrap(), Token::Name("Name".to_string()));
        assert_eq!(lexer.next_token().unwrap(), Token::Eof);
    }

    // A leading '-' is consumed as part of the number.
    #[test]
    fn test_lexer_negative_numbers() {
        let input = b"-123 -45.67";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(lexer.next_token().unwrap(), Token::Integer(-123));
        assert_eq!(lexer.next_token().unwrap(), Token::Real(-45.67));
    }

    // Literal and hex strings both produce Token::String bytes.
    #[test]
    fn test_lexer_strings() {
        let input = b"(Hello World) <48656C6C6F>";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(
            lexer.next_token().unwrap(),
            Token::String(b"Hello World".to_vec())
        );
        assert_eq!(
            lexer.next_token().unwrap(),
            Token::String(b"Hello".to_vec())
        );
    }

    // '<<' and '>>' delimit dictionaries.
    #[test]
    fn test_lexer_dictionaries() {
        let input = b"<< /Type /Page >>";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
        assert_eq!(lexer.next_token().unwrap(), Token::Name("Type".to_string()));
        assert_eq!(lexer.next_token().unwrap(), Token::Name("Page".to_string()));
        assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
    }

    // '[' and ']' delimit arrays.
    #[test]
    fn test_lexer_arrays() {
        let input = b"[1 2 3]";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(2));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(3));
        assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
    }

    // The lexer emits 'R' as Name("R"); reference assembly is a parser job.
    #[test]
    fn test_lexer_references() {
        let input = b"1 0 R 25 1 R";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
        match lexer.next_token().unwrap() {
            Token::Name(s) if s == "R" => {}
            other => panic!("Expected R token, got {other:?}"),
        }

        assert_eq!(lexer.next_token().unwrap(), Token::Integer(25));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
        match lexer.next_token().unwrap() {
            Token::Name(s) if s == "R" => {}
            other => panic!("Expected R token, got {other:?}"),
        }
    }

    // Comment text excludes the '%' and the trailing newline.
    #[test]
    fn test_lexer_comments() {
        let input = b"%PDF-1.7\n123";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(
            lexer.next_token().unwrap(),
            Token::Comment("PDF-1.7".to_string())
        );
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
    }
1151
1152 mod comprehensive_tests {
1154 use super::*;
1155 use std::io::Cursor;
1156
        // Token derives Debug with variant name and payload visible.
        #[test]
        fn test_token_debug_trait() {
            let token = Token::Integer(42);
            let debug_str = format!("{token:?}");
            assert!(debug_str.contains("Integer"));
            assert!(debug_str.contains("42"));
        }

        #[test]
        fn test_token_clone() {
            let token = Token::String(b"hello".to_vec());
            let cloned = token.clone();
            assert_eq!(token, cloned);
        }

        #[test]
        fn test_token_equality() {
            assert_eq!(Token::Integer(42), Token::Integer(42));
            assert_ne!(Token::Integer(42), Token::Integer(43));
            assert_eq!(Token::Boolean(true), Token::Boolean(true));
            assert_ne!(Token::Boolean(true), Token::Boolean(false));
            assert_eq!(Token::Null, Token::Null);
            assert_ne!(Token::Null, Token::Integer(0));
        }

        // Empty input yields Eof immediately.
        #[test]
        fn test_lexer_empty_input() {
            let input = b"";
            let mut lexer = Lexer::new(Cursor::new(input));
            assert_eq!(lexer.next_token().unwrap(), Token::Eof);
        }

        // Pure whitespace is skipped, then Eof.
        #[test]
        fn test_lexer_whitespace_only() {
            let input = b" \t\n\r ";
            let mut lexer = Lexer::new(Cursor::new(input));
            assert_eq!(lexer.next_token().unwrap(), Token::Eof);
        }

        // '+' sign is accepted; -0 normalizes to 0; > 32-bit values fit i64.
        #[test]
        fn test_lexer_integer_edge_cases() {
            let input = b"0 +123 -0 9876543210";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(9876543210));
        }

        // Bare leading ('.5') and trailing ('5.') dots are accepted.
        #[test]
        fn test_lexer_real_edge_cases() {
            let input = b"0.0 +3.14 -2.71828 .5 5. 123.456789";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::Real(0.0));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(3.14));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(-2.71828));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(0.5));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(5.0));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(123.456789));
        }

        // Exponent notation (a non-standard PDF extension) parses as Real.
        #[test]
        fn test_lexer_scientific_notation() {
            let input = b"1.23e10 -4.56E-5 1e0 2E+3";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::Real(1.23e10));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(-4.56e-5));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(1e0));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(2e3));
        }

        // Backslash escapes in literal strings decode to their bytes.
        #[test]
        fn test_lexer_string_literal_escapes() {
            let input = b"(Hello\\nWorld) (Tab\\tChar) (Quote\\\"Mark) (Backslash\\\\)";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Hello\nWorld".to_vec())
            );
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Tab\tChar".to_vec())
            );
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Quote\"Mark".to_vec())
            );
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Backslash\\".to_vec())
            );
        }

        // Balanced inner parentheses are kept verbatim.
        #[test]
        fn test_lexer_string_literal_nested_parens() {
            let input = b"(Nested (parentheses) work)";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Nested (parentheses) work".to_vec())
            );
        }

        #[test]
        fn test_lexer_string_literal_empty() {
            let input = b"()";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::String(b"".to_vec()));
        }

        #[test]
        fn test_lexer_hexadecimal_strings() {
            let input = b"<48656C6C6F> <20576F726C64> <>";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Hello".to_vec())
            );
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b" World".to_vec())
            );
            assert_eq!(lexer.next_token().unwrap(), Token::String(b"".to_vec()));
        }

        // Odd digit counts are padded with a trailing '0' per the spec.
        #[test]
        fn test_lexer_hexadecimal_strings_odd_length() {
            let input = b"<48656C6C6F2> <1> <ABC>";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Hello ".to_vec())
            );
            assert_eq!(lexer.next_token().unwrap(), Token::String(b"\x10".to_vec()));
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"\xAB\xC0".to_vec())
            );
        }

        // Whitespace inside hex strings is ignored.
        #[test]
        fn test_lexer_hexadecimal_strings_whitespace() {
            let input = b"<48 65 6C 6C 6F>";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Hello".to_vec())
            );
        }

        #[test]
        fn test_lexer_names() {
            let input = b"/Type /Page /Root /Kids /Count /MediaBox";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::Name("Type".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("Page".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("Root".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("Kids".to_string()));
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::Name("Count".to_string())
            );
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::Name("MediaBox".to_string())
            );
        }

        // '#xx' hex escapes decode into the named byte.
        #[test]
        fn test_lexer_names_with_special_chars() {
            let input = b"/Name#20with#20spaces /Name#2Fwith#2Fslashes";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(
                lexer.next_token().unwrap(),
                Token::Name("Name with spaces".to_string())
            );
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::Name("Name/with/slashes".to_string())
            );
        }

        // '/' alone is an empty name; keyword-looking words stay names.
        #[test]
        fn test_lexer_names_edge_cases() {
            let input = b"/ /A /123 /true /false /null";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::Name("".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("A".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("123".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("true".to_string()));
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::Name("false".to_string())
            );
            assert_eq!(lexer.next_token().unwrap(), Token::Name("null".to_string()));
        }

        // Dict tokens nest without any special handling by the lexer.
        #[test]
        fn test_lexer_nested_dictionaries() {
            let input = b"<< /Type /Page /Resources << /Font << /F1 123 0 R >> >> >>";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
            assert_eq!(lexer.next_token().unwrap(), Token::Name("Type".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("Page".to_string()));
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::Name("Resources".to_string())
            );
            assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
            assert_eq!(lexer.next_token().unwrap(), Token::Name("Font".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
            assert_eq!(lexer.next_token().unwrap(), Token::Name("F1".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("R".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
            assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
            assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
        }
1390
1391 #[test]
1392 fn test_lexer_nested_arrays() {
1393 let input = b"[[1 2] [3 4] [5 [6 7]]]";
1394 let mut lexer = Lexer::new(Cursor::new(input));
1395
1396 assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
1397 assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
1398 assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
1399 assert_eq!(lexer.next_token().unwrap(), Token::Integer(2));
1400 assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
1401 assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
1402 assert_eq!(lexer.next_token().unwrap(), Token::Integer(3));
1403 assert_eq!(lexer.next_token().unwrap(), Token::Integer(4));
1404 assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
1405 assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
1406 assert_eq!(lexer.next_token().unwrap(), Token::Integer(5));
1407 assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
1408 assert_eq!(lexer.next_token().unwrap(), Token::Integer(6));
1409 assert_eq!(lexer.next_token().unwrap(), Token::Integer(7));
1410 assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
1411 assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
1412 assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
1413 }
1414
1415 #[test]
1416 fn test_lexer_mixed_content() {
1417 let input = b"<< /Type /Page /MediaBox [0 0 612 792] /Resources << /Font << /F1 << /Type /Font /Subtype /Type1 >> >> >> >>";
1418 let mut lexer = Lexer::new(Cursor::new(input));
1419
1420 let mut tokens = Vec::new();
1422 loop {
1423 match lexer.next_token().unwrap() {
1424 Token::Eof => break,
1425 token => tokens.push(token),
1426 }
1427 }
1428 assert!(tokens.len() > 10);
1429 }
1430
1431 #[test]
1432 fn test_lexer_keywords() {
1433 let input = b"obj endobj stream endstream startxref";
1434 let mut lexer = Lexer::new(Cursor::new(input));
1435
1436 assert_eq!(lexer.next_token().unwrap(), Token::Obj);
1437 assert_eq!(lexer.next_token().unwrap(), Token::EndObj);
1438 assert_eq!(lexer.next_token().unwrap(), Token::Stream);
1439 assert_eq!(lexer.next_token().unwrap(), Token::EndStream);
1440 assert_eq!(lexer.next_token().unwrap(), Token::StartXRef);
1441 }
1442
1443 #[test]
1444 fn test_lexer_multiple_comments() {
1445 let input = b"%First comment\n%Second comment\n123";
1446 let mut lexer = Lexer::new(Cursor::new(input));
1447
1448 assert_eq!(
1449 lexer.next_token().unwrap(),
1450 Token::Comment("First comment".to_string())
1451 );
1452 assert_eq!(
1453 lexer.next_token().unwrap(),
1454 Token::Comment("Second comment".to_string())
1455 );
1456 assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
1457 }
1458
1459 #[test]
1460 fn test_lexer_comment_without_newline() {
1461 let input = b"%Comment at end";
1462 let mut lexer = Lexer::new(Cursor::new(input));
1463
1464 assert_eq!(
1465 lexer.next_token().unwrap(),
1466 Token::Comment("Comment at end".to_string())
1467 );
1468 assert_eq!(lexer.next_token().unwrap(), Token::Eof);
1469 }
1470
1471 #[test]
1472 fn test_lexer_special_characters_in_streams() {
1473 let input = b"<< /Length 5 >> stream\nHello endstream";
1474 let mut lexer = Lexer::new(Cursor::new(input));
1475
1476 assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
1477 assert_eq!(
1478 lexer.next_token().unwrap(),
1479 Token::Name("Length".to_string())
1480 );
1481 assert_eq!(lexer.next_token().unwrap(), Token::Integer(5));
1482 assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
1483 assert_eq!(lexer.next_token().unwrap(), Token::Stream);
1484 }
1486
1487 #[test]
1488 fn test_lexer_push_token() {
1489 let input = b"123 456";
1490 let mut lexer = Lexer::new(Cursor::new(input));
1491
1492 let token1 = lexer.next_token().unwrap();
1493 assert_eq!(token1, Token::Integer(123));
1494
1495 let token2 = lexer.next_token().unwrap();
1496 assert_eq!(token2, Token::Integer(456));
1497
1498 lexer.push_token(token2.clone());
1500
1501 let token3 = lexer.next_token().unwrap();
1503 assert_eq!(token3, token2);
1504
1505 let token4 = lexer.next_token().unwrap();
1507 assert_eq!(token4, Token::Eof);
1508 }
1509
1510 #[test]
1511 fn test_lexer_push_multiple_tokens() {
1512 let input = b"123";
1513 let mut lexer = Lexer::new(Cursor::new(input));
1514
1515 let original_token = lexer.next_token().unwrap();
1516 assert_eq!(original_token, Token::Integer(123));
1517
1518 lexer.push_token(Token::Boolean(true));
1520 lexer.push_token(Token::Boolean(false));
1521 lexer.push_token(Token::Null);
1522
1523 assert_eq!(lexer.next_token().unwrap(), Token::Null);
1525 assert_eq!(lexer.next_token().unwrap(), Token::Boolean(false));
1526 assert_eq!(lexer.next_token().unwrap(), Token::Boolean(true));
1527 assert_eq!(lexer.next_token().unwrap(), Token::Eof);
1528 }
1529
1530 #[test]
1531 fn test_lexer_read_newline() {
1532 let input = b"123\n456\r\n789";
1533 let mut lexer = Lexer::new(Cursor::new(input));
1534
1535 let digits1 = lexer.read_digits().unwrap();
1537 assert_eq!(digits1, "123");
1538 assert!(lexer.read_newline().is_ok());
1539
1540 let digits2 = lexer.read_digits().unwrap();
1542 assert_eq!(digits2, "456");
1543 assert!(lexer.read_newline().is_ok());
1544
1545 let digits3 = lexer.read_digits().unwrap();
1547 assert_eq!(digits3, "789");
1548 }
1549
1550 #[test]
1551 fn test_lexer_read_bytes() {
1552 let input = b"Hello World";
1553 let mut lexer = Lexer::new(Cursor::new(input));
1554
1555 let bytes = lexer.read_bytes(5).unwrap();
1556 assert_eq!(bytes, b"Hello");
1557
1558 let bytes = lexer.read_bytes(6).unwrap();
1559 assert_eq!(bytes, b" World");
1560 }
1561
1562 #[test]
1563 fn test_lexer_read_until_sequence() {
1564 let input = b"Hello endstream World";
1565 let mut lexer = Lexer::new(Cursor::new(input));
1566
1567 let result = lexer.read_until_sequence(b"endstream").unwrap();
1568 assert_eq!(result, b"Hello ");
1569
1570 let rest = lexer.read_digits().unwrap();
1572 assert_eq!(rest, ""); }
1574
1575 #[test]
1576 fn test_lexer_read_until_sequence_not_found() {
1577 let input = b"Hello World";
1578 let mut lexer = Lexer::new(Cursor::new(input));
1579
1580 let result = lexer.read_until_sequence(b"notfound");
1581 assert!(result.is_err());
1582 }
1583
1584 #[test]
1585 fn test_lexer_position_tracking() {
1586 let input = b"123 456";
1587 let mut lexer = Lexer::new(Cursor::new(input));
1588
1589 let initial_pos = lexer.position();
1590 assert_eq!(initial_pos, 0);
1591
1592 lexer.next_token().unwrap(); let pos_after_first = lexer.position();
1594 assert!(pos_after_first > initial_pos);
1595
1596 lexer.next_token().unwrap(); let pos_after_second = lexer.position();
1598 assert!(pos_after_second > pos_after_first);
1599 }
1600
1601 #[test]
1602 fn test_lexer_large_numbers() {
1603 let input = b"2147483647 -2147483648 9223372036854775807 -9223372036854775808";
1604 let mut lexer = Lexer::new(Cursor::new(input));
1605
1606 assert_eq!(lexer.next_token().unwrap(), Token::Integer(2147483647));
1607 assert_eq!(lexer.next_token().unwrap(), Token::Integer(-2147483648));
1608 assert_eq!(
1609 lexer.next_token().unwrap(),
1610 Token::Integer(9223372036854775807)
1611 );
1612 assert_eq!(
1613 lexer.next_token().unwrap(),
1614 Token::Integer(-9223372036854775808)
1615 );
1616 }
1617
1618 #[test]
1619 fn test_lexer_very_long_string() {
1620 let long_str = "A".repeat(1000);
1621 let input = format!("({long_str})");
1622 let mut lexer = Lexer::new(Cursor::new(input.as_bytes()));
1623
1624 if let Token::String(s) = lexer.next_token().unwrap() {
1625 assert_eq!(s.len(), 1000);
1626 assert_eq!(s, long_str.as_bytes());
1627 } else {
1628 panic!("Expected string token");
1629 }
1630 }
1631
1632 #[test]
1633 fn test_lexer_very_long_name() {
1634 let long_name = "A".repeat(500);
1635 let input = format!("/{long_name}");
1636 let mut lexer = Lexer::new(Cursor::new(input.as_bytes()));
1637
1638 if let Token::Name(name) = lexer.next_token().unwrap() {
1639 assert_eq!(name.len(), 500);
1640 assert_eq!(name, long_name);
1641 } else {
1642 panic!("Expected name token");
1643 }
1644 }
1645
1646 #[test]
1647 fn test_lexer_error_handling_invalid_hex() {
1648 let input = b"<48656C6C6FG>";
1649 let mut lexer = Lexer::new(Cursor::new(input));
1650
1651 let result = lexer.next_token();
1653 assert!(result.is_ok() || result.is_err()); }
1655
1656 #[test]
1657 fn test_lexer_all_token_types() {
1658 let input = b"true false null 123 -456 3.14 (string) <48656C6C6F> /Name [ ] << >> obj endobj stream endstream startxref % comment\n";
1659 let mut lexer = Lexer::new(Cursor::new(input));
1660
1661 let mut token_types = Vec::new();
1662 loop {
1663 match lexer.next_token().unwrap() {
1664 Token::Eof => break,
1665 token => token_types.push(std::mem::discriminant(&token)),
1666 }
1667 }
1668
1669 assert!(token_types.len() > 10);
1671 }
1672
1673 #[test]
1674 fn test_lexer_performance() {
1675 let input = "123 456 789 ".repeat(1000);
1676 let mut lexer = Lexer::new(Cursor::new(input.as_bytes()));
1677
1678 let start_time = std::time::Instant::now();
1679 let mut count = 0;
1680 loop {
1681 match lexer.next_token().unwrap() {
1682 Token::Eof => break,
1683 _ => count += 1,
1684 }
1685 }
1686 let elapsed = start_time.elapsed();
1687
1688 assert_eq!(count, 3000); assert!(elapsed.as_millis() < 1000); }
1691 }
1692
1693 #[test]
1694 fn test_lexer_find_keyword_ahead() {
1695 let input = b"some data here endstream more data";
1696 let mut lexer = Lexer::new(Cursor::new(input));
1697
1698 let result = lexer.find_keyword_ahead("endstream", 100);
1700 assert!(result.is_ok());
1701 assert_eq!(result.unwrap(), Some(15)); let result2 = lexer.find_keyword_ahead("notfound", 100);
1705 assert!(result2.is_ok());
1706 assert_eq!(result2.unwrap(), None);
1707
1708 let result3 = lexer.find_keyword_ahead("endstream", 10);
1710 assert!(result3.is_ok());
1711 assert_eq!(result3.unwrap(), None); }
1713
1714 #[test]
1715 fn test_lexer_peek_token() {
1716 let input = b"123 456 /Name";
1717 let mut lexer = Lexer::new(Cursor::new(input));
1718
1719 let peeked = lexer.peek_token();
1721 assert!(peeked.is_ok());
1722 assert_eq!(peeked.unwrap(), Token::Integer(123));
1723
1724 let next = lexer.next_token();
1726 assert!(next.is_ok());
1727 assert_eq!(next.unwrap(), Token::Integer(123));
1728
1729 assert_eq!(lexer.peek_token().unwrap(), Token::Integer(456));
1731 assert_eq!(lexer.next_token().unwrap(), Token::Integer(456));
1732
1733 assert_eq!(lexer.peek_token().unwrap(), Token::Name("Name".to_string()));
1734 assert_eq!(lexer.next_token().unwrap(), Token::Name("Name".to_string()));
1735 }
1736
1737 #[test]
1738 fn test_lexer_expect_keyword() {
1739 let input = b"endstream obj endobj";
1740 let mut lexer = Lexer::new(Cursor::new(input));
1741
1742 assert!(lexer.expect_keyword("endstream").is_ok());
1744
1745 assert!(lexer.expect_keyword("obj").is_ok());
1747
1748 let result = lexer.expect_keyword("stream");
1750 assert!(result.is_err());
1751 match result {
1752 Err(ParseError::UnexpectedToken { expected, found }) => {
1753 assert!(expected.contains("stream"));
1754 assert!(found.contains("EndObj"));
1755 }
1756 _ => panic!("Expected UnexpectedToken error"),
1757 }
1758 }
1759
1760 #[test]
1761 fn test_lexer_save_restore_position() {
1762 let input = b"123 456 789";
1763 let mut lexer = Lexer::new(Cursor::new(input));
1764
1765 assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
1767
1768 let saved = lexer.save_position();
1770 assert!(saved.is_ok());
1771 let saved_pos = saved.unwrap();
1772
1773 assert_eq!(lexer.next_token().unwrap(), Token::Integer(456));
1775 assert_eq!(lexer.next_token().unwrap(), Token::Integer(789));
1776
1777 assert!(lexer.restore_position(saved_pos).is_ok());
1779
1780 assert_eq!(lexer.next_token().unwrap(), Token::Integer(456));
1782 }
1783
1784 #[test]
1785 fn test_lexer_character_encoding_recovery() {
1786 let input = b"(Caf\x80 \x91Hello\x92)"; let options = ParseOptions::lenient();
1789 let mut lexer = Lexer::new_with_options(Cursor::new(input), options);
1790
1791 match lexer.next_token().unwrap() {
1792 Token::String(bytes) => {
1793 let text = String::from_utf8_lossy(&bytes);
1795 println!("Recovered text: {text}");
1796 assert!(!text.is_empty()); }
1798 other => panic!("Expected String token, got {other:?}"),
1799 }
1800
1801 let warnings = lexer.warnings();
1803 if !warnings.is_empty() {
1804 println!("Encoding warnings: {warnings:?}");
1805 }
1806 }
1807}