1use super::{ParseError, ParseOptions, ParseResult, ParseWarning};
6use std::io::{Read, Seek, SeekFrom};
7
/// A lexical token produced by the lexer from PDF-syntax input.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// Boolean literal, lexed from the words `true` / `false`.
    Boolean(bool),

    /// Number written without a decimal point or exponent.
    Integer(i64),

    /// Number written with a decimal point and/or an exponent.
    Real(f64),

    /// Raw bytes of a literal `( ... )` or hexadecimal `< ... >` string.
    String(Vec<u8>),

    /// Name object; the leading `/` is not stored and `#xx` escapes are already decoded.
    Name(String),

    /// `[`
    ArrayStart,

    /// `]`
    ArrayEnd,

    /// `<<`
    DictStart,

    /// `>>`
    DictEnd,

    /// The `stream` keyword.
    Stream,

    /// The `endstream` keyword.
    EndStream,

    /// The `obj` keyword.
    Obj,

    /// The `endobj` keyword.
    EndObj,

    /// The `startxref` keyword.
    StartXRef,

    /// Indirect reference. Presumably (object number, generation number) - confirm with the
    /// parser that builds this variant.
    Reference(u32, u16),

    /// The `null` keyword.
    Null,

    /// Text following a `%` up to (not including) the end of the line.
    Comment(String),

    /// End of input.
    Eof,
}
65
66pub struct Lexer<R> {
68 reader: std::io::BufReader<R>,
69 #[allow(dead_code)]
70 buffer: Vec<u8>,
71 position: usize,
72 peek_buffer: Option<u8>,
73 token_buffer: Vec<Token>,
74 options: ParseOptions,
75 warnings: Vec<ParseWarning>,
76}
77
78impl<R: Read> Lexer<R> {
79 pub fn new(reader: R) -> Self {
81 Self::new_with_options(reader, ParseOptions::default())
82 }
83
84 pub fn new_with_options(reader: R, options: ParseOptions) -> Self {
86 Self {
87 reader: std::io::BufReader::new(reader),
88 buffer: Vec::with_capacity(1024),
89 position: 0,
90 peek_buffer: None,
91 token_buffer: Vec::new(),
92 options,
93 warnings: Vec::new(),
94 }
95 }
96
97 pub fn warnings(&self) -> &[ParseWarning] {
99 &self.warnings
100 }
101
102 pub fn next_token(&mut self) -> ParseResult<Token> {
104 if let Some(token) = self.token_buffer.pop() {
106 return Ok(token);
107 }
108
109 self.skip_whitespace()?;
110
111 let ch = match self.peek_char()? {
112 Some(ch) => ch,
113 None => return Ok(Token::Eof),
114 };
115
116 match ch {
117 b'%' => self.read_comment(),
118 b'/' => self.read_name(),
119 b'(' => self.read_literal_string(),
120 b'<' => self.read_angle_bracket(),
121 b'>' => {
122 self.consume_char()?;
123 if self.peek_char()? == Some(b'>') {
124 self.consume_char()?;
125 Ok(Token::DictEnd)
126 } else {
127 Err(ParseError::SyntaxError {
128 position: self.position,
129 message: "Expected '>' after '>'".to_string(),
130 })
131 }
132 }
133 b'[' => {
134 self.consume_char()?;
135 Ok(Token::ArrayStart)
136 }
137 b']' => {
138 self.consume_char()?;
139 Ok(Token::ArrayEnd)
140 }
141 b't' | b'f' => self.read_boolean(),
142 b'n' => self.read_null(),
143 b'+' | b'-' | b'0'..=b'9' | b'.' => self.read_number(),
144 b'R' => {
145 self.consume_char()?;
147 Ok(Token::Name("R".to_string()))
148 }
149 _ if ch.is_ascii_alphabetic() => self.read_keyword(),
150 b';' => {
151 self.consume_char()?;
153 self.next_token() }
155 _ => {
156 if self.is_problematic_encoding_char(ch) {
158 self.handle_encoding_char_in_token_stream(ch)
159 } else if self.options.lenient_syntax {
160 if self.options.collect_warnings {
162 tracing::debug!(
163 "Warning: Skipping unexpected character '{}' at position {}",
164 ch as char,
165 self.position
166 );
167 }
168 self.consume_char()?;
169 self.next_token() } else {
171 Err(ParseError::SyntaxError {
172 position: self.position,
173 message: format!("Unexpected character: {}", ch as char),
174 })
175 }
176 }
177 }
178 }
179
180 fn peek_char(&mut self) -> ParseResult<Option<u8>> {
182 if let Some(ch) = self.peek_buffer {
183 return Ok(Some(ch));
184 }
185
186 let mut buf = [0u8; 1];
187 match self.reader.read_exact(&mut buf) {
188 Ok(_) => {
189 self.peek_buffer = Some(buf[0]);
190 Ok(Some(buf[0]))
191 }
192 Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => Ok(None),
193 Err(e) => Err(e.into()),
194 }
195 }
196
197 fn consume_char(&mut self) -> ParseResult<Option<u8>> {
199 let ch = self.peek_char()?;
200 if ch.is_some() {
201 self.peek_buffer = None;
202 self.position += 1;
203 }
204 Ok(ch)
205 }
206
207 pub(crate) fn skip_whitespace(&mut self) -> ParseResult<usize> {
209 let mut count = 0;
210 while let Some(ch) = self.peek_char()? {
211 if ch.is_ascii_whitespace() {
212 self.consume_char()?;
213 count += 1;
214 } else {
215 break;
216 }
217 }
218 Ok(count)
219 }
220
221 fn read_comment(&mut self) -> ParseResult<Token> {
223 self.consume_char()?; let mut comment = String::new();
225
226 while let Some(ch) = self.peek_char()? {
227 if ch == b'\n' || ch == b'\r' {
228 break;
229 }
230 self.consume_char()?;
231 comment.push(ch as char);
232 }
233
234 Ok(Token::Comment(comment))
235 }
236
237 fn read_name(&mut self) -> ParseResult<Token> {
239 self.consume_char()?; let mut name = String::new();
241
242 while let Some(ch) = self.peek_char()? {
243 if ch.is_ascii_whitespace()
244 || matches!(ch, b'/' | b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'%')
245 {
246 break;
247 }
248 self.consume_char()?;
249
250 if ch == b'#' {
252 let hex1 = self
253 .consume_char()?
254 .ok_or_else(|| ParseError::SyntaxError {
255 position: self.position,
256 message: "Incomplete hex code in name".to_string(),
257 })?;
258 let hex2 = self
259 .consume_char()?
260 .ok_or_else(|| ParseError::SyntaxError {
261 position: self.position,
262 message: "Incomplete hex code in name".to_string(),
263 })?;
264
265 let value = u8::from_str_radix(&format!("{}{}", hex1 as char, hex2 as char), 16)
266 .map_err(|_| ParseError::SyntaxError {
267 position: self.position,
268 message: "Invalid hex code in name".to_string(),
269 })?;
270
271 name.push(value as char);
272 } else {
273 name.push(ch as char);
274 }
275 }
276
277 Ok(Token::Name(name))
278 }
279
280 fn read_literal_string(&mut self) -> ParseResult<Token> {
282 self.consume_char()?; let mut string = Vec::new();
284 let mut paren_depth = 1;
285 let mut escape = false;
286
287 while paren_depth > 0 {
288 let ch = match self.consume_char()? {
289 Some(c) => c,
290 None => {
291 if self.options.lenient_syntax {
292 if self.options.collect_warnings {
294 self.warnings.push(ParseWarning::SyntaxErrorRecovered {
295 position: self.position,
296 expected: "closing parenthesis".to_string(),
297 found: "EOF".to_string(),
298 recovery_action: "returned partial string content".to_string(),
299 });
300 }
301 break;
302 } else {
303 return Err(ParseError::SyntaxError {
304 position: self.position,
305 message: "Unterminated string".to_string(),
306 });
307 }
308 }
309 };
310
311 if escape {
312 let escaped = match ch {
313 b'n' => b'\n',
314 b'r' => b'\r',
315 b't' => b'\t',
316 b'b' => b'\x08',
317 b'f' => b'\x0C',
318 b'(' => b'(',
319 b')' => b')',
320 b'\\' => b'\\',
321 b'0'..=b'7' => {
322 let mut value = u16::from(ch - b'0');
326 for _ in 0..2 {
327 if let Some(next) = self.peek_char()? {
328 if matches!(next, b'0'..=b'7') {
329 self.consume_char()?;
330 value = value * 8 + u16::from(next - b'0');
331 } else {
332 break;
333 }
334 }
335 }
336 value as u8
337 }
338 _ => ch, };
340 string.push(escaped);
341 escape = false;
342 } else {
343 match ch {
344 b'\\' => escape = true,
345 b'(' => {
346 string.push(ch);
347 paren_depth += 1;
348 }
349 b')' => {
350 paren_depth -= 1;
351 if paren_depth > 0 {
352 string.push(ch);
353 }
354 }
355 _ => string.push(ch),
356 }
357 }
358 }
359
360 let processed_string = if self.options.lenient_encoding {
362 self.process_string_with_encoding_recovery(&string)?
363 } else {
364 string
365 };
366
367 Ok(Token::String(processed_string))
368 }
369
370 fn read_angle_bracket(&mut self) -> ParseResult<Token> {
372 self.consume_char()?; if self.peek_char()? == Some(b'<') {
375 self.consume_char()?;
376 Ok(Token::DictStart)
377 } else {
378 let mut hex_chars = String::new();
380 let mut found_end = false;
381
382 while let Some(ch) = self.peek_char()? {
383 if ch == b'>' {
384 self.consume_char()?;
385 found_end = true;
386 break;
387 }
388 self.consume_char()?;
389 if ch.is_ascii_hexdigit() {
390 hex_chars.push(ch as char);
391 } else if !ch.is_ascii_whitespace() {
392 if self.options.lenient_syntax {
393 if self.options.collect_warnings {
395 self.warnings.push(ParseWarning::SyntaxErrorRecovered {
396 position: self.position,
397 expected: "hex digit".to_string(),
398 found: format!("'{}'", ch as char),
399 recovery_action: "skipped invalid character".to_string(),
400 });
401 }
402 } else {
403 return Err(ParseError::SyntaxError {
404 position: self.position,
405 message: "Invalid character in hex string".to_string(),
406 });
407 }
408 }
409 }
410
411 if !found_end {
412 if self.options.lenient_syntax {
413 if self.options.collect_warnings {
415 self.warnings.push(ParseWarning::SyntaxErrorRecovered {
416 position: self.position,
417 expected: ">".to_string(),
418 found: "EOF".to_string(),
419 recovery_action: "returned partial hex string".to_string(),
420 });
421 }
422 } else {
423 return Err(ParseError::SyntaxError {
424 position: self.position,
425 message: "Unterminated hex string".to_string(),
426 });
427 }
428 }
429
430 if hex_chars.len() % 2 != 0 {
432 hex_chars.push('0');
433 }
434
435 let mut bytes = Vec::new();
437 for chunk in hex_chars.as_bytes().chunks(2) {
438 let hex_str = std::str::from_utf8(chunk).map_err(|_| ParseError::SyntaxError {
439 position: self.position,
440 message: "Invalid UTF-8 in hex string".to_string(),
441 })?;
442 let byte =
443 u8::from_str_radix(hex_str, 16).map_err(|_| ParseError::SyntaxError {
444 position: self.position,
445 message: "Invalid hex string".to_string(),
446 })?;
447 bytes.push(byte);
448 }
449
450 Ok(Token::String(bytes))
451 }
452 }
453
454 fn read_boolean(&mut self) -> ParseResult<Token> {
456 let word = self.read_word()?;
457 match word.as_str() {
458 "true" => Ok(Token::Boolean(true)),
459 "false" => Ok(Token::Boolean(false)),
460 _ => {
461 self.process_keyword(word)
463 }
464 }
465 }
466
467 fn read_null(&mut self) -> ParseResult<Token> {
469 let word = self.read_word()?;
470 if word == "null" {
471 Ok(Token::Null)
472 } else {
473 self.process_keyword(word)
475 }
476 }
477
478 fn read_number(&mut self) -> ParseResult<Token> {
480 let mut number_str = String::new();
481 let mut has_dot = false;
482
483 if let Some(ch) = self.peek_char()? {
485 if ch == b'+' || ch == b'-' {
486 self.consume_char()?;
487 number_str.push(ch as char);
488
489 if let Some(next) = self.peek_char()? {
491 if !next.is_ascii_digit() && next != b'.' {
492 return Err(ParseError::SyntaxError {
493 position: self.position,
494 message: "Expected digit after sign".to_string(),
495 });
496 }
497 }
498 }
499 }
500
501 while let Some(ch) = self.peek_char()? {
503 match ch {
504 b'0'..=b'9' => {
505 self.consume_char()?;
506 number_str.push(ch as char);
507 }
508 b'.' if !has_dot => {
509 self.consume_char()?;
510 number_str.push(ch as char);
511 has_dot = true;
512 }
513 _ => break,
514 }
515 }
516
517 if let Some(ch) = self.peek_char()? {
519 if ch == b'e' || ch == b'E' {
520 self.consume_char()?;
521 number_str.push(ch as char);
522
523 if let Some(sign_ch) = self.peek_char()? {
525 if sign_ch == b'+' || sign_ch == b'-' {
526 self.consume_char()?;
527 number_str.push(sign_ch as char);
528 }
529 }
530
531 while let Some(digit_ch) = self.peek_char()? {
533 if digit_ch.is_ascii_digit() {
534 self.consume_char()?;
535 number_str.push(digit_ch as char);
536 } else {
537 break;
538 }
539 }
540
541 has_dot = true;
543 }
544 }
545
546 if has_dot {
551 let value = number_str
552 .parse::<f64>()
553 .map_err(|_| ParseError::SyntaxError {
554 position: self.position,
555 message: format!("Invalid real number: '{number_str}'"),
556 })?;
557 Ok(Token::Real(value))
558 } else {
559 let value = number_str
560 .parse::<i64>()
561 .map_err(|_| ParseError::SyntaxError {
562 position: self.position,
563 message: format!("Invalid integer: '{number_str}'"),
564 })?;
565 Ok(Token::Integer(value))
566 }
567 }
568
569 fn read_keyword(&mut self) -> ParseResult<Token> {
571 let word = self.read_word()?;
572 self.process_keyword(word)
573 }
574
575 fn process_keyword(&self, word: String) -> ParseResult<Token> {
577 match word.as_str() {
578 "stream" => Ok(Token::Stream),
579 "endstream" => Ok(Token::EndStream),
580 "obj" => Ok(Token::Obj),
581 "endobj" => Ok(Token::EndObj),
582 "startxref" => Ok(Token::StartXRef),
583 _ => Err(ParseError::SyntaxError {
584 position: self.position,
585 message: format!("Unknown keyword: {word}"),
586 }),
587 }
588 }
589
590 fn read_word(&mut self) -> ParseResult<String> {
592 let mut word = String::new();
593
594 while let Some(ch) = self.peek_char()? {
595 if ch.is_ascii_whitespace()
596 || matches!(ch, b'/' | b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'%')
597 {
598 break;
599 }
600 self.consume_char()?;
601 word.push(ch as char);
602 }
603
604 Ok(word)
605 }
606
607 #[allow(dead_code)]
609 fn read_digits(&mut self) -> ParseResult<String> {
610 let mut digits = String::new();
611
612 while let Some(ch) = self.peek_char()? {
613 if ch.is_ascii_digit() {
614 self.consume_char()?;
615 digits.push(ch as char);
616 } else {
617 break;
618 }
619 }
620
621 Ok(digits)
622 }
623
624 pub fn read_newline(&mut self) -> ParseResult<()> {
626 match self.peek_char()? {
627 Some(b'\r') => {
628 self.consume_char()?;
629 if self.peek_char()? == Some(b'\n') {
631 self.consume_char()?;
632 }
633 Ok(())
634 }
635 Some(b'\n') => {
636 self.consume_char()?;
637 Ok(())
638 }
639 _ => Err(ParseError::SyntaxError {
640 position: self.position,
641 message: "Expected newline".to_string(),
642 }),
643 }
644 }
645
646 pub fn peek_byte(&mut self) -> ParseResult<u8> {
649 match self.peek_char()? {
650 Some(b) => Ok(b),
651 None => Err(ParseError::UnexpectedToken {
652 expected: "byte".to_string(),
653 found: "EOF".to_string(),
654 }),
655 }
656 }
657
658 pub fn read_byte(&mut self) -> ParseResult<u8> {
660 match self.consume_char()? {
661 Some(b) => Ok(b),
662 None => Err(ParseError::UnexpectedToken {
663 expected: "byte".to_string(),
664 found: "EOF".to_string(),
665 }),
666 }
667 }
668
669 pub fn seek(&mut self, pos: u64) -> ParseResult<()>
671 where
672 R: Seek,
673 {
674 self.reader.seek(SeekFrom::Start(pos))?;
675 self.position = pos as usize;
676 Ok(())
677 }
678
679 pub fn read_bytes(&mut self, n: usize) -> ParseResult<Vec<u8>> {
680 let mut bytes = Vec::with_capacity(n);
681
682 if self.peek_buffer.is_some() && n > 0 {
684 if let Some(byte) = self.consume_char()? {
685 bytes.push(byte);
686 }
687 }
688
689 let remaining = n - bytes.len();
691 if remaining > 0 {
692 let mut rest = vec![0u8; remaining];
693 self.reader.read_exact(&mut rest)?;
694 self.position += remaining;
695 bytes.extend_from_slice(&rest);
696 }
697
698 Ok(bytes)
699 }
700
701 pub fn read_until_sequence(&mut self, sequence: &[u8]) -> ParseResult<Vec<u8>> {
703 let mut result = Vec::new();
704 let mut match_pos = 0;
705
706 while let Some(ch) = self.consume_char()? {
707 result.push(ch);
708
709 if ch == sequence[match_pos] {
710 match_pos += 1;
711 if match_pos == sequence.len() {
712 result.truncate(result.len() - sequence.len());
714 break;
715 }
716 } else if ch == sequence[0] {
717 match_pos = 1;
718 } else {
719 match_pos = 0;
720 }
721 }
722
723 if match_pos < sequence.len() {
724 return Err(ParseError::SyntaxError {
725 position: self.position,
726 message: format!("Sequence {sequence:?} not found"),
727 });
728 }
729
730 Ok(result)
731 }
732
733 pub fn position(&self) -> usize {
735 self.position
736 }
737
738 pub fn push_token(&mut self, token: Token) {
740 self.token_buffer.push(token);
741 }
742
743 pub fn expect_keyword(&mut self, keyword: &str) -> ParseResult<()> {
745 let token = self.next_token()?;
746 match (keyword, &token) {
747 ("endstream", Token::EndStream) => Ok(()),
748 ("stream", Token::Stream) => Ok(()),
749 ("endobj", Token::EndObj) => Ok(()),
750 ("obj", Token::Obj) => Ok(()),
751 ("startxref", Token::StartXRef) => Ok(()),
752 _ => Err(ParseError::UnexpectedToken {
753 expected: format!("keyword '{keyword}'"),
754 found: format!("{token:?}"),
755 }),
756 }
757 }
758
759 pub fn find_keyword_ahead(
762 &mut self,
763 keyword: &str,
764 max_bytes: usize,
765 ) -> ParseResult<Option<usize>>
766 where
767 R: Seek,
768 {
769 use std::io::{Read, Seek, SeekFrom};
770
771 let current_pos = self.reader.stream_position()?;
773 let start_buffer_state = self.peek_buffer;
774
775 let keyword_bytes = keyword.as_bytes();
776 let mut bytes_read = 0;
777 let mut match_buffer = Vec::new();
778
779 if let Some(buffered) = start_buffer_state {
786 bytes_read = 1;
787 match_buffer.push(buffered);
788 if match_buffer.len() == keyword_bytes.len() && match_buffer == keyword_bytes {
789 self.reader.seek(SeekFrom::Start(current_pos))?;
790 self.peek_buffer = start_buffer_state;
791 return Ok(Some(bytes_read - keyword_bytes.len()));
792 }
793 }
794
795 while bytes_read < max_bytes {
797 let mut byte = [0u8; 1];
798 match self.reader.read_exact(&mut byte) {
799 Ok(_) => {
800 bytes_read += 1;
801 match_buffer.push(byte[0]);
802
803 if match_buffer.len() > keyword_bytes.len() {
805 match_buffer.remove(0);
806 }
807
808 if match_buffer.len() == keyword_bytes.len() && match_buffer == keyword_bytes {
810 self.reader.seek(SeekFrom::Start(current_pos))?;
812 self.peek_buffer = start_buffer_state;
813 return Ok(Some(bytes_read - keyword_bytes.len()));
814 }
815 }
816 Err(_) => break, }
818 }
819
820 self.reader.seek(SeekFrom::Start(current_pos))?;
822 self.peek_buffer = start_buffer_state;
823 Ok(None)
824 }
825
826 pub fn peek_ahead(&mut self, n: usize) -> ParseResult<Vec<u8>>
828 where
829 R: Seek,
830 {
831 use std::io::{Read, Seek, SeekFrom};
832
833 let current_pos = self.reader.stream_position()?;
835 let start_buffer_state = self.peek_buffer;
836
837 let mut bytes = vec![0u8; n];
839 let bytes_read = self.reader.read(&mut bytes)?;
840 bytes.truncate(bytes_read);
841
842 self.reader.seek(SeekFrom::Start(current_pos))?;
844 self.peek_buffer = start_buffer_state;
845
846 Ok(bytes)
847 }
848
849 pub fn save_position(&mut self) -> ParseResult<(u64, Option<u8>)>
851 where
852 R: Seek,
853 {
854 use std::io::Seek;
855 let pos = self.reader.stream_position()?;
856 Ok((pos, self.peek_buffer))
857 }
858
859 pub fn restore_position(&mut self, saved: (u64, Option<u8>)) -> ParseResult<()>
861 where
862 R: Seek,
863 {
864 use std::io::{Seek, SeekFrom};
865 self.reader.seek(SeekFrom::Start(saved.0))?;
866 self.peek_buffer = saved.1;
867 self.position = saved.0 as usize;
868 Ok(())
869 }
870
871 pub fn peek_token(&mut self) -> ParseResult<Token>
879 where
880 R: Seek,
881 {
882 let saved_pos = self.save_position()?;
883 let result = self.next_token();
884 self.restore_position(saved_pos)?;
885 result
886 }
887
888 fn process_string_with_encoding_recovery(
890 &mut self,
891 string_bytes: &[u8],
892 ) -> ParseResult<Vec<u8>> {
893 use super::encoding::{CharacterDecoder, EncodingOptions, EncodingType, EnhancedDecoder};
894
895 let has_problematic_chars = string_bytes.iter().any(|&b| {
897 (0x80..=0x9F).contains(&b)
899 || b == 0x07
900 || (b <= 0x1F && b != 0x09 && b != 0x0A && b != 0x0D)
901 });
902
903 let decoder = EnhancedDecoder::new();
904
905 let encoding_options = if has_problematic_chars {
907 EncodingOptions {
908 lenient_mode: true, preferred_encoding: Some(EncodingType::Windows1252), max_replacements: std::cmp::max(100, string_bytes.len() / 10), log_issues: self.options.collect_warnings,
912 }
913 } else {
914 EncodingOptions {
915 lenient_mode: self.options.lenient_encoding,
916 preferred_encoding: self.options.preferred_encoding,
917 max_replacements: 50,
918 log_issues: self.options.collect_warnings,
919 }
920 };
921
922 match decoder.decode(string_bytes, &encoding_options) {
923 Ok(result) => {
924 if (result.replacement_count > 0 || has_problematic_chars)
926 && self.options.collect_warnings
927 {
928 self.warnings.push(ParseWarning::InvalidEncoding {
929 position: self.position,
930 recovered_text: if result.text.len() > 50 {
931 let truncate_at = result
933 .text
934 .char_indices()
935 .map(|(i, _)| i)
936 .nth(47)
937 .unwrap_or_else(|| {
938 let limit = result.text.len().min(47);
940 let mut pos = limit;
941 while pos > 0 && !result.text.is_char_boundary(pos) {
942 pos -= 1;
943 }
944 pos
945 });
946
947 let safe_text = if truncate_at <= result.text.len()
949 && result.text.is_char_boundary(truncate_at)
950 {
951 result.text[..truncate_at].to_string()
952 } else {
953 result.text.chars().take(47).collect::<String>()
955 };
956
957 format!(
958 "{}... (truncated, {} chars total)",
959 safe_text,
960 result.text.chars().count()
961 )
962 } else {
963 result.text.clone()
964 },
965 encoding_used: result.detected_encoding,
966 replacement_count: result.replacement_count,
967 });
968 }
969
970 Ok(result.text.into_bytes())
972 }
973 Err(encoding_error) => {
974 if self.options.lenient_encoding {
975 let fallback_result = self.apply_fallback_encoding_strategy(string_bytes);
977
978 if self.options.collect_warnings {
979 self.warnings.push(ParseWarning::InvalidEncoding {
980 position: self.position,
981 recovered_text: format!(
982 "Fallback strategy applied: {} -> {} chars",
983 string_bytes.len(),
984 fallback_result.len()
985 ),
986 encoding_used: None,
987 replacement_count: string_bytes.len(),
988 });
989 }
990 Ok(fallback_result)
991 } else {
992 Err(ParseError::CharacterEncodingError {
993 position: self.position,
994 message: format!(
995 "Failed to decode string with any supported encoding: {encoding_error}"
996 ),
997 })
998 }
999 }
1000 }
1001 }
1002
1003 fn apply_fallback_encoding_strategy(&self, string_bytes: &[u8]) -> Vec<u8> {
1005 let mut result = Vec::with_capacity(string_bytes.len());
1006
1007 for &byte in string_bytes {
1008 match byte {
1009 0x00..=0x08 | 0x0B | 0x0C | 0x0E..=0x1F => {
1011 result.push(b' '); }
1013 0x80..=0x9F => {
1014 let replacement = match byte {
1016 0x80 => b'E', 0x81 => b' ', 0x82 => b',', 0x83 => b'f', 0x84 => b'"', 0x85 => b'.', 0x86 => b'+', 0x87 => b'+', 0x88 => b'^', 0x89 => b'%', 0x8A => b'S', 0x8B => b'<', 0x8C => b'O', 0x8D => b' ', 0x8E => b'Z', 0x8F => b' ', 0x90 => b' ', 0x91 => b'\'', 0x92 => b'\'', 0x93 => b'"', 0x94 => b'"', 0x95 => b'*', 0x96 => b'-', 0x97 => b'-', 0x98 => b'~', 0x99 => b'T', 0x9A => b's', 0x9B => b'>', 0x9C => b'o', 0x9D => b' ', 0x9E => b'z', 0x9F => b'Y', _ => b'?', };
1050 result.push(replacement);
1051 }
1052 _ => {
1053 result.push(byte); }
1055 }
1056 }
1057
1058 result
1059 }
1060
1061 fn is_problematic_encoding_char(&self, ch: u8) -> bool {
1063 (0x80..=0x9F).contains(&ch) ||
1065 ch == 0x07 || (ch <= 0x1F && ch != 0x09 && ch != 0x0A && ch != 0x0D) || (self.options.lenient_syntax && ch >= 0xA0) }
1070
1071 fn handle_encoding_char_in_token_stream(&mut self, ch: u8) -> ParseResult<Token> {
1073 if self.options.lenient_encoding {
1074 self.consume_char()?;
1076
1077 if self.options.collect_warnings {
1079 let replacement_char = match ch {
1080 0x07 => "bell",
1081 0x00..=0x1F => "control",
1082 0x80..=0x9F => "latin1-supplement",
1083 _ => "unknown",
1084 };
1085
1086 self.warnings.push(ParseWarning::InvalidEncoding {
1087 position: self.position,
1088 recovered_text: format!(
1089 "Skipped problematic {replacement_char} character (0x{ch:02X})"
1090 ),
1091 encoding_used: None,
1092 replacement_count: 1,
1093 });
1094 }
1095
1096 self.skip_whitespace()?;
1098 if let Ok(Some(_)) = self.peek_char() {
1099 self.next_token() } else {
1101 Err(ParseError::SyntaxError {
1102 position: self.position,
1103 message: "Unexpected end of file after problematic character".to_string(),
1104 })
1105 }
1106 } else {
1107 let char_description = match ch {
1109 0x07 => "Bell character (\\u{07})".to_string(),
1110 0x00..=0x1F => format!("Control character (\\u{{{ch:02X}}})"),
1111 0x80..=0x9F => format!("Latin-1 supplement character (\\u{{{ch:02X}}})"),
1112 _ => format!("Problematic character (\\u{{{ch:02X}}})"),
1113 };
1114
1115 Err(ParseError::CharacterEncodingError {
1116 position: self.position,
1117 message: format!(
1118 "Unexpected character: {char_description} - Consider using lenient parsing mode"
1119 ),
1120 })
1121 }
1122 }
1123}
1124
1125#[cfg(test)]
1126mod tests {
1127 use super::*;
1128 use std::io::Cursor;
1129
1130 #[test]
1131 fn test_lexer_basic_tokens() {
1132 let input = b"123 -456 3.14 true false null /Name";
1134 let mut lexer = Lexer::new(Cursor::new(input));
1135
1136 assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
1137 assert_eq!(lexer.next_token().unwrap(), Token::Integer(-456));
1138 assert_eq!(lexer.next_token().unwrap(), Token::Real(3.14));
1139 assert_eq!(lexer.next_token().unwrap(), Token::Boolean(true));
1140 assert_eq!(lexer.next_token().unwrap(), Token::Boolean(false));
1141 assert_eq!(lexer.next_token().unwrap(), Token::Null);
1142 assert_eq!(lexer.next_token().unwrap(), Token::Name("Name".to_string()));
1143 assert_eq!(lexer.next_token().unwrap(), Token::Eof);
1144 }
1145
1146 #[test]
1147 fn test_lexer_negative_numbers() {
1148 let input = b"-123 -45.67";
1150 let mut lexer = Lexer::new(Cursor::new(input));
1151
1152 assert_eq!(lexer.next_token().unwrap(), Token::Integer(-123));
1153 assert_eq!(lexer.next_token().unwrap(), Token::Real(-45.67));
1154 }
1155
1156 #[test]
1157 fn test_lexer_strings() {
1158 let input = b"(Hello World) <48656C6C6F>";
1159 let mut lexer = Lexer::new(Cursor::new(input));
1160
1161 assert_eq!(
1162 lexer.next_token().unwrap(),
1163 Token::String(b"Hello World".to_vec())
1164 );
1165 assert_eq!(
1166 lexer.next_token().unwrap(),
1167 Token::String(b"Hello".to_vec())
1168 );
1169 }
1170
1171 #[test]
1172 fn test_lexer_dictionaries() {
1173 let input = b"<< /Type /Page >>";
1174 let mut lexer = Lexer::new(Cursor::new(input));
1175
1176 assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
1177 assert_eq!(lexer.next_token().unwrap(), Token::Name("Type".to_string()));
1178 assert_eq!(lexer.next_token().unwrap(), Token::Name("Page".to_string()));
1179 assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
1180 }
1181
1182 #[test]
1183 fn test_lexer_arrays() {
1184 let input = b"[1 2 3]";
1185 let mut lexer = Lexer::new(Cursor::new(input));
1186
1187 assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
1188 assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
1189 assert_eq!(lexer.next_token().unwrap(), Token::Integer(2));
1190 assert_eq!(lexer.next_token().unwrap(), Token::Integer(3));
1191 assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
1192 }
1193
1194 #[test]
1195 fn test_lexer_references() {
1196 let input = b"1 0 R 25 1 R";
1197 let mut lexer = Lexer::new(Cursor::new(input));
1198
1199 assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
1201 assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
1202 match lexer.next_token().unwrap() {
1204 Token::Name(s) if s == "R" => {} other => panic!("Expected R token, got {other:?}"),
1206 }
1207
1208 assert_eq!(lexer.next_token().unwrap(), Token::Integer(25));
1209 assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
1210 match lexer.next_token().unwrap() {
1211 Token::Name(s) if s == "R" => {} other => panic!("Expected R token, got {other:?}"),
1213 }
1214 }
1215
1216 #[test]
1217 fn test_lexer_comments() {
1218 let input = b"%PDF-1.7\n123";
1219 let mut lexer = Lexer::new(Cursor::new(input));
1220
1221 assert_eq!(
1222 lexer.next_token().unwrap(),
1223 Token::Comment("PDF-1.7".to_string())
1224 );
1225 assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
1226 }
1227
1228 mod comprehensive_tests {
1230 use super::*;
1231 use std::io::Cursor;
1232
1233 #[test]
1234 fn test_token_debug_trait() {
1235 let token = Token::Integer(42);
1236 let debug_str = format!("{token:?}");
1237 assert!(debug_str.contains("Integer"));
1238 assert!(debug_str.contains("42"));
1239 }
1240
1241 #[test]
1242 fn test_token_clone() {
1243 let token = Token::String(b"hello".to_vec());
1244 let cloned = token.clone();
1245 assert_eq!(token, cloned);
1246 }
1247
1248 #[test]
1249 fn test_token_equality() {
1250 assert_eq!(Token::Integer(42), Token::Integer(42));
1251 assert_ne!(Token::Integer(42), Token::Integer(43));
1252 assert_eq!(Token::Boolean(true), Token::Boolean(true));
1253 assert_ne!(Token::Boolean(true), Token::Boolean(false));
1254 assert_eq!(Token::Null, Token::Null);
1255 assert_ne!(Token::Null, Token::Integer(0));
1256 }
1257
1258 #[test]
1259 fn test_lexer_empty_input() {
1260 let input = b"";
1261 let mut lexer = Lexer::new(Cursor::new(input));
1262 assert_eq!(lexer.next_token().unwrap(), Token::Eof);
1263 }
1264
1265 #[test]
1266 fn test_lexer_whitespace_only() {
1267 let input = b" \t\n\r ";
1268 let mut lexer = Lexer::new(Cursor::new(input));
1269 assert_eq!(lexer.next_token().unwrap(), Token::Eof);
1270 }
1271
1272 #[test]
1273 fn test_lexer_integer_edge_cases() {
1274 let input = b"0 +123 -0 9876543210";
1275 let mut lexer = Lexer::new(Cursor::new(input));
1276
1277 assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
1278 assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
1279 assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
1280 assert_eq!(lexer.next_token().unwrap(), Token::Integer(9876543210));
1281 }
1282
1283 #[test]
1284 fn test_lexer_real_edge_cases() {
1285 let input = b"0.0 +3.14 -2.71828 .5 5. 123.456789";
1286 let mut lexer = Lexer::new(Cursor::new(input));
1287
1288 assert_eq!(lexer.next_token().unwrap(), Token::Real(0.0));
1289 assert_eq!(lexer.next_token().unwrap(), Token::Real(3.14));
1290 assert_eq!(lexer.next_token().unwrap(), Token::Real(-2.71828));
1291 assert_eq!(lexer.next_token().unwrap(), Token::Real(0.5));
1292 assert_eq!(lexer.next_token().unwrap(), Token::Real(5.0));
1293 assert_eq!(lexer.next_token().unwrap(), Token::Real(123.456789));
1294 }
1295
1296 #[test]
1297 fn test_lexer_scientific_notation() {
1298 let input = b"1.23e10 -4.56E-5 1e0 2E+3";
1299 let mut lexer = Lexer::new(Cursor::new(input));
1300
1301 assert_eq!(lexer.next_token().unwrap(), Token::Real(1.23e10));
1302 assert_eq!(lexer.next_token().unwrap(), Token::Real(-4.56e-5));
1303 assert_eq!(lexer.next_token().unwrap(), Token::Real(1e0));
1304 assert_eq!(lexer.next_token().unwrap(), Token::Real(2e3));
1305 }
1306
1307 #[test]
1308 fn test_lexer_string_literal_escapes() {
1309 let input = b"(Hello\\nWorld) (Tab\\tChar) (Quote\\\"Mark) (Backslash\\\\)";
1310 let mut lexer = Lexer::new(Cursor::new(input));
1311
1312 assert_eq!(
1313 lexer.next_token().unwrap(),
1314 Token::String(b"Hello\nWorld".to_vec())
1315 );
1316 assert_eq!(
1317 lexer.next_token().unwrap(),
1318 Token::String(b"Tab\tChar".to_vec())
1319 );
1320 assert_eq!(
1321 lexer.next_token().unwrap(),
1322 Token::String(b"Quote\"Mark".to_vec())
1323 );
1324 assert_eq!(
1325 lexer.next_token().unwrap(),
1326 Token::String(b"Backslash\\".to_vec())
1327 );
1328 }
1329
1330 #[test]
1331 fn test_lexer_string_literal_nested_parens() {
1332 let input = b"(Nested (parentheses) work)";
1333 let mut lexer = Lexer::new(Cursor::new(input));
1334
1335 assert_eq!(
1336 lexer.next_token().unwrap(),
1337 Token::String(b"Nested (parentheses) work".to_vec())
1338 );
1339 }
1340
1341 #[test]
1342 fn test_lexer_string_literal_empty() {
1343 let input = b"()";
1344 let mut lexer = Lexer::new(Cursor::new(input));
1345
1346 assert_eq!(lexer.next_token().unwrap(), Token::String(b"".to_vec()));
1347 }
1348
1349 #[test]
1350 fn test_lexer_hexadecimal_strings() {
1351 let input = b"<48656C6C6F> <20576F726C64> <>";
1352 let mut lexer = Lexer::new(Cursor::new(input));
1353
1354 assert_eq!(
1355 lexer.next_token().unwrap(),
1356 Token::String(b"Hello".to_vec())
1357 );
1358 assert_eq!(
1359 lexer.next_token().unwrap(),
1360 Token::String(b" World".to_vec())
1361 );
1362 assert_eq!(lexer.next_token().unwrap(), Token::String(b"".to_vec()));
1363 }
1364
1365 #[test]
1366 fn test_lexer_hexadecimal_strings_odd_length() {
1367 let input = b"<48656C6C6F2> <1> <ABC>";
1368 let mut lexer = Lexer::new(Cursor::new(input));
1369
1370 assert_eq!(
1372 lexer.next_token().unwrap(),
1373 Token::String(b"Hello ".to_vec())
1374 );
1375 assert_eq!(lexer.next_token().unwrap(), Token::String(b"\x10".to_vec()));
1376 assert_eq!(
1377 lexer.next_token().unwrap(),
1378 Token::String(b"\xAB\xC0".to_vec())
1379 );
1380 }
1381
1382 #[test]
1383 fn test_lexer_hexadecimal_strings_whitespace() {
1384 let input = b"<48 65 6C 6C 6F>";
1385 let mut lexer = Lexer::new(Cursor::new(input));
1386
1387 assert_eq!(
1388 lexer.next_token().unwrap(),
1389 Token::String(b"Hello".to_vec())
1390 );
1391 }
1392
/// Plain name objects are lexed without their leading `/`.
#[test]
fn test_lexer_names() {
    let mut lexer = Lexer::new(Cursor::new(b"/Type /Page /Root /Kids /Count /MediaBox"));

    // Names are expected back in input order.
    for name in ["Type", "Page", "Root", "Kids", "Count", "MediaBox"] {
        assert_eq!(lexer.next_token().unwrap(), Token::Name(name.to_string()));
    }
}
1411
/// `#xx` hex escapes inside a name are decoded: `#20` to a space and `#2F` to `/`.
#[test]
fn test_lexer_names_with_special_chars() {
    let mut lexer = Lexer::new(Cursor::new(
        b"/Name#20with#20spaces /Name#2Fwith#2Fslashes",
    ));

    for decoded in ["Name with spaces", "Name/with/slashes"] {
        assert_eq!(
            lexer.next_token().unwrap(),
            Token::Name(decoded.to_string())
        );
    }
}
1426
/// Edge cases for names: a bare `/` gives an empty name, and text that would
/// otherwise read as a number or keyword stays a name when prefixed with `/`.
#[test]
fn test_lexer_names_edge_cases() {
    let mut lexer = Lexer::new(Cursor::new(b"/ /A /123 /true /false /null"));

    for name in ["", "A", "123", "true", "false", "null"] {
        assert_eq!(lexer.next_token().unwrap(), Token::Name(name.to_string()));
    }
}
1442
/// Three levels of nested dictionaries produce matching `DictStart`/`DictEnd`
/// tokens, and `123 0 R` is expected as two integers followed by the name `R`.
#[test]
fn test_lexer_nested_dictionaries() {
    let mut lexer = Lexer::new(Cursor::new(
        b"<< /Type /Page /Resources << /Font << /F1 123 0 R >> >> >>",
    ));
    let name = |text: &str| Token::Name(text.to_string());

    let expected = [
        Token::DictStart,
        name("Type"),
        name("Page"),
        name("Resources"),
        Token::DictStart,
        name("Font"),
        Token::DictStart,
        name("F1"),
        Token::Integer(123),
        Token::Integer(0),
        name("R"),
        Token::DictEnd,
        Token::DictEnd,
        Token::DictEnd,
    ];
    for token in expected {
        assert_eq!(lexer.next_token().unwrap(), token);
    }
}
1466
/// Nested arrays produce one `ArrayStart`/`ArrayEnd` token per bracket, with
/// the integers between them in input order.
#[test]
fn test_lexer_nested_arrays() {
    let mut lexer = Lexer::new(Cursor::new(b"[[1 2] [3 4] [5 [6 7]]]"));

    let expected = [
        // "[[1 2]"
        Token::ArrayStart,
        Token::ArrayStart,
        Token::Integer(1),
        Token::Integer(2),
        Token::ArrayEnd,
        // "[3 4]"
        Token::ArrayStart,
        Token::Integer(3),
        Token::Integer(4),
        Token::ArrayEnd,
        // "[5 [6 7]]]"
        Token::ArrayStart,
        Token::Integer(5),
        Token::ArrayStart,
        Token::Integer(6),
        Token::Integer(7),
        Token::ArrayEnd,
        Token::ArrayEnd,
        Token::ArrayEnd,
    ];
    for token in expected {
        assert_eq!(lexer.next_token().unwrap(), token);
    }
}
1490
/// Smoke test: a realistic page dictionary lexes to the end without error and
/// produces more than ten tokens before `Eof`.
#[test]
fn test_lexer_mixed_content() {
    let input = b"<< /Type /Page /MediaBox [0 0 612 792] /Resources << /Font << /F1 << /Type /Font /Subtype /Type1 >> >> >> >>";
    let mut lexer = Lexer::new(Cursor::new(input));

    // Pull tokens until `Eof`; any lexing error panics through `unwrap`.
    let tokens: Vec<Token> = std::iter::from_fn(|| match lexer.next_token().unwrap() {
        Token::Eof => None,
        token => Some(token),
    })
    .collect();

    assert!(tokens.len() > 10);
}
1506
/// Each structural keyword maps to its dedicated token variant.
#[test]
fn test_lexer_keywords() {
    let mut lexer = Lexer::new(Cursor::new(b"obj endobj stream endstream startxref"));

    let expected = [
        Token::Obj,
        Token::EndObj,
        Token::Stream,
        Token::EndStream,
        Token::StartXRef,
    ];
    for keyword in expected {
        assert_eq!(lexer.next_token().unwrap(), keyword);
    }
}
1518
/// Consecutive comment lines each become their own `Comment` token (without
/// the `%` or the line ending), and lexing resumes normally after them.
#[test]
fn test_lexer_multiple_comments() {
    let mut lexer = Lexer::new(Cursor::new(b"%First comment\n%Second comment\n123"));

    for text in ["First comment", "Second comment"] {
        assert_eq!(
            lexer.next_token().unwrap(),
            Token::Comment(text.to_string())
        );
    }
    assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
}
1534
/// A comment that ends at end-of-input (no trailing newline) is still
/// returned in full, and the next token is `Eof`.
#[test]
fn test_lexer_comment_without_newline() {
    let mut lexer = Lexer::new(Cursor::new(b"%Comment at end"));

    let comment = lexer.next_token().unwrap();
    let after = lexer.next_token().unwrap();

    assert_eq!(comment, Token::Comment("Comment at end".to_string()));
    assert_eq!(after, Token::Eof);
}
1546
/// Lexes a stream dictionary up to and including the `stream` keyword.
/// The stream payload and `endstream` are deliberately not consumed here.
#[test]
fn test_lexer_special_characters_in_streams() {
    let mut lexer = Lexer::new(Cursor::new(b"<< /Length 5 >> stream\nHello endstream"));

    let expected = [
        Token::DictStart,
        Token::Name("Length".to_string()),
        Token::Integer(5),
        Token::DictEnd,
        Token::Stream,
    ];
    for token in expected {
        assert_eq!(lexer.next_token().unwrap(), token);
    }
}
1562
/// A token handed back with `push_token` is the next one returned by
/// `next_token`; afterwards the lexer continues with the remaining input.
#[test]
fn test_lexer_push_token() {
    let mut lexer = Lexer::new(Cursor::new(b"123 456"));

    assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));

    let second = lexer.next_token().unwrap();
    assert_eq!(second, Token::Integer(456));

    // Hand the second token back to the lexer.
    lexer.push_token(second.clone());

    // It must be replayed before the lexer touches the input again.
    let replayed = lexer.next_token().unwrap();
    assert_eq!(replayed, second);

    // The input was already exhausted, so only `Eof` remains.
    let last = lexer.next_token().unwrap();
    assert_eq!(last, Token::Eof);
}
1585
/// Several pushed-back tokens are returned in last-in, first-out order
/// before the lexer reads from the input again.
#[test]
fn test_lexer_push_multiple_tokens() {
    let mut lexer = Lexer::new(Cursor::new(b"123"));

    let first = lexer.next_token().unwrap();
    assert_eq!(first, Token::Integer(123));

    let pushed = [Token::Boolean(true), Token::Boolean(false), Token::Null];
    for token in pushed.iter().cloned() {
        lexer.push_token(token);
    }

    // The most recently pushed token comes back first.
    for token in pushed.iter().rev() {
        assert_eq!(&lexer.next_token().unwrap(), token);
    }
    assert_eq!(lexer.next_token().unwrap(), Token::Eof);
}
1605
/// `read_newline` succeeds on both an `\n` and an `\r\n` line ending,
/// leaving the lexer positioned at the digits that follow.
#[test]
fn test_lexer_read_newline() {
    let mut lexer = Lexer::new(Cursor::new(b"123\n456\r\n789"));

    // First line, terminated by "\n".
    assert_eq!(lexer.read_digits().unwrap(), "123");
    assert!(lexer.read_newline().is_ok());

    // Second line, terminated by "\r\n".
    assert_eq!(lexer.read_digits().unwrap(), "456");
    assert!(lexer.read_newline().is_ok());

    // Last line has no terminator.
    assert_eq!(lexer.read_digits().unwrap(), "789");
}
1625
/// Successive `read_bytes` calls return consecutive, non-overlapping slices
/// of the input.
#[test]
fn test_lexer_read_bytes() {
    let mut lexer = Lexer::new(Cursor::new(b"Hello World"));

    let head = lexer.read_bytes(5).unwrap();
    let tail = lexer.read_bytes(6).unwrap();

    assert_eq!(head, b"Hello");
    assert_eq!(tail, b" World");
}
1637
/// `read_until_sequence` returns the bytes that precede the marker.
#[test]
fn test_lexer_read_until_sequence() {
    let mut lexer = Lexer::new(Cursor::new(b"Hello endstream World"));

    let before_marker = lexer.read_until_sequence(b"endstream").unwrap();
    assert_eq!(before_marker, b"Hello ");

    // Whatever follows is not a digit, so reading digits yields nothing.
    let digits = lexer.read_digits().unwrap();
    assert_eq!(digits, "");
}
1650
/// Searching for a marker that never occurs in the input is an error.
#[test]
fn test_lexer_read_until_sequence_not_found() {
    let mut lexer = Lexer::new(Cursor::new(b"Hello World"));

    let outcome = lexer.read_until_sequence(b"notfound");

    assert!(outcome.is_err());
}
1659
/// The reported position starts at zero and strictly increases after each
/// token is consumed.
#[test]
fn test_lexer_position_tracking() {
    let mut lexer = Lexer::new(Cursor::new(b"123 456"));

    let start = lexer.position();
    assert_eq!(start, 0);

    // Consume "123".
    lexer.next_token().unwrap();
    let after_first = lexer.position();
    assert!(after_first > start);

    // Consume "456".
    lexer.next_token().unwrap();
    let after_second = lexer.position();
    assert!(after_second > after_first);
}
1676
/// Integers at the 32-bit and 64-bit signed limits are lexed exactly.
#[test]
fn test_lexer_large_numbers() {
    let mut lexer = Lexer::new(Cursor::new(
        b"2147483647 -2147483648 9223372036854775807 -9223372036854775808",
    ));

    // The input literals spell out these limits, in this order.
    let limits = [
        i64::from(i32::MAX),
        i64::from(i32::MIN),
        i64::MAX,
        i64::MIN,
    ];
    for value in limits {
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(value));
    }
}
1693
/// A 1000-byte literal string is returned intact.
#[test]
fn test_lexer_very_long_string() {
    let payload = "A".repeat(1000);
    let source = format!("({payload})");
    let mut lexer = Lexer::new(Cursor::new(source.as_bytes()));

    match lexer.next_token().unwrap() {
        Token::String(bytes) => {
            assert_eq!(bytes.len(), 1000);
            assert_eq!(bytes, payload.as_bytes());
        }
        _ => panic!("Expected string token"),
    }
}
1707
/// A 500-character name is returned intact.
#[test]
fn test_lexer_very_long_name() {
    let expected = "A".repeat(500);
    let source = format!("/{expected}");
    let mut lexer = Lexer::new(Cursor::new(source.as_bytes()));

    match lexer.next_token().unwrap() {
        Token::Name(name) => {
            assert_eq!(name.len(), 500);
            assert_eq!(name, expected);
        }
        _ => panic!("Expected name token"),
    }
}
1721
/// Feeds the lexer a hex string that contains a non-hex digit (`G`).
///
/// This test does not pin whether the lexer rejects the input or recovers
/// from it. It requires that the call returns without panicking and that,
/// if a token is produced, it is a string token. The previous assertion
/// (`is_ok() || is_err()`) was a tautology and could never fail.
#[test]
fn test_lexer_error_handling_invalid_hex() {
    let input = b"<48656C6C6FG>";
    let mut lexer = Lexer::new(Cursor::new(input));

    // An `Err` is an acceptable outcome for malformed hex and needs no check.
    if let Ok(token) = lexer.next_token() {
        // Input starting with a single `<` can only be a hex string.
        assert!(
            matches!(token, Token::String(_)),
            "invalid hex string lexed to unexpected token: {token:?}"
        );
    }
}
1731
/// Smoke test: an input containing one of each token form lexes to the end
/// without error and produces more than ten tokens before `Eof`.
#[test]
fn test_lexer_all_token_types() {
    let input = b"true false null 123 -456 3.14 (string) <48656C6C6F> /Name [ ] << >> obj endobj stream endstream startxref % comment\n";
    let mut lexer = Lexer::new(Cursor::new(input));

    // Record only the variant of each token, not its payload.
    let mut kinds = Vec::new();
    let mut token = lexer.next_token().unwrap();
    while token != Token::Eof {
        kinds.push(std::mem::discriminant(&token));
        token = lexer.next_token().unwrap();
    }

    assert!(kinds.len() > 10);
}
1748
/// Lexes 3000 integers and checks both the token count and that the run
/// finishes in under one second of wall-clock time.
#[test]
fn test_lexer_performance() {
    let input = "123 456 789 ".repeat(1000);
    let mut lexer = Lexer::new(Cursor::new(input.as_bytes()));

    let started = std::time::Instant::now();
    let mut produced = 0;
    while lexer.next_token().unwrap() != Token::Eof {
        produced += 1;
    }
    let elapsed = started.elapsed();

    // 1000 repetitions of three integers each.
    assert_eq!(produced, 3000);
    assert!(elapsed.as_millis() < 1000);
}
1767 }
1768
/// `find_keyword_ahead` reports the offset of a keyword inside the look-ahead
/// window, and `None` when it is absent or lies beyond the window.
#[test]
fn test_lexer_find_keyword_ahead() {
    let mut lexer = Lexer::new(Cursor::new(b"some data here endstream more data"));

    // "some data here " is 15 bytes, so the keyword starts at offset 15.
    let found = lexer.find_keyword_ahead("endstream", 100).unwrap();
    assert_eq!(found, Some(15));

    // A keyword that is not in the input at all.
    let missing = lexer.find_keyword_ahead("notfound", 100).unwrap();
    assert_eq!(missing, None);

    // A 10-byte window ends before the keyword begins.
    let out_of_window = lexer.find_keyword_ahead("endstream", 10).unwrap();
    assert_eq!(out_of_window, None);
}
1789
/// `peek_token` shows the upcoming token without consuming it: the following
/// `next_token` call must return that same token.
#[test]
fn test_lexer_peek_token() {
    let mut lexer = Lexer::new(Cursor::new(b"123 456 /Name"));

    let expected = [
        Token::Integer(123),
        Token::Integer(456),
        Token::Name("Name".to_string()),
    ];
    for token in expected {
        assert_eq!(lexer.peek_token().unwrap(), token);
        assert_eq!(lexer.next_token().unwrap(), token);
    }
}
1812
/// `expect_keyword` succeeds when the next token is the requested keyword and
/// otherwise fails with `UnexpectedToken`, naming both what was expected and
/// what was found.
#[test]
fn test_lexer_expect_keyword() {
    let mut lexer = Lexer::new(Cursor::new(b"endstream obj endobj"));

    // The first two keywords match the input in order.
    for keyword in ["endstream", "obj"] {
        assert!(lexer.expect_keyword(keyword).is_ok());
    }

    // The input now holds `endobj`, so asking for `stream` must fail.
    match lexer.expect_keyword("stream") {
        Err(ParseError::UnexpectedToken { expected, found }) => {
            assert!(expected.contains("stream"));
            assert!(found.contains("EndObj"));
        }
        _ => panic!("Expected UnexpectedToken error"),
    }
}
1835
/// Restoring a saved position rewinds the lexer, so tokens read after the
/// save point are produced again.
#[test]
fn test_lexer_save_restore_position() {
    let mut lexer = Lexer::new(Cursor::new(b"123 456 789"));

    assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));

    // Checkpoint taken right after the first token.
    let checkpoint = lexer.save_position().unwrap();

    // Read past the checkpoint.
    for value in [456, 789] {
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(value));
    }

    assert!(lexer.restore_position(checkpoint).is_ok());

    // Back at the checkpoint, the second token is read again.
    assert_eq!(lexer.next_token().unwrap(), Token::Integer(456));
}
1859
/// With lenient options, a literal string containing bytes that are not valid
/// UTF-8 (`0x80`, `0x91`, `0x92`) still lexes to a non-empty string token.
/// Any warnings the lexer recorded are only logged, not asserted on.
#[test]
fn test_lexer_character_encoding_recovery() {
    let raw = b"(Caf\x80 \x91Hello\x92)";
    let mut lexer = Lexer::new_with_options(Cursor::new(raw), ParseOptions::lenient());

    let token = lexer.next_token().unwrap();
    match &token {
        Token::String(bytes) => {
            let text = String::from_utf8_lossy(bytes);
            tracing::debug!("Recovered text: {text}");
            assert!(!text.is_empty());
        }
        _ => panic!("Expected String token, got {token:?}"),
    }

    let warnings = lexer.warnings();
    if !warnings.is_empty() {
        tracing::debug!("Encoding warnings: {warnings:?}");
    }
}
1883
1884 fn lexer_no_encoding(data: &[u8]) -> Lexer<Cursor<&[u8]>> {
1886 let mut opts = ParseOptions::default();
1887 opts.lenient_encoding = false;
1888 Lexer::new_with_options(Cursor::new(data), opts)
1889 }
1890
/// `\777` (octal 511) exceeds one byte; the expected result is its low
/// 8 bits, `0xFF`.
#[test]
fn test_lexer_octal_escape_overflow_777_raw() {
    let mut lexer = lexer_no_encoding(b"(\\777)");

    let token = lexer.next_token().unwrap();
    match &token {
        Token::String(bytes) => assert_eq!(bytes.as_slice(), &[0xFF_u8]),
        _ => panic!("Expected String token, got {token:?}"),
    }
}
1902
/// `\400` (octal 256) exceeds one byte; the expected result is its low
/// 8 bits, `0x00`.
#[test]
fn test_lexer_octal_escape_overflow_400_raw() {
    let mut lexer = lexer_no_encoding(b"(\\400)");

    let token = lexer.next_token().unwrap();
    match &token {
        Token::String(bytes) => assert_eq!(bytes.as_slice(), &[0x00_u8]),
        _ => panic!("Expected String token, got {token:?}"),
    }
}
1912
/// `\377` (octal 255) is the largest octal escape that fits in a byte and
/// must decode to `0xFF`.
#[test]
fn test_lexer_octal_escape_max_valid_377_raw() {
    let mut lexer = lexer_no_encoding(b"(\\377)");

    let token = lexer.next_token().unwrap();
    match &token {
        Token::String(bytes) => assert_eq!(bytes.as_slice(), &[0xFF_u8]),
        _ => panic!("Expected String token, got {token:?}"),
    }
}
1922
/// An overflowing escape (`\777` -> `0xFF`) and an in-range escape
/// (`\101` -> `A`) decode correctly when mixed with ordinary characters.
#[test]
fn test_lexer_octal_escape_overflow_mixed_raw() {
    let mut lexer = lexer_no_encoding(b"(A\\777B\\101C)");

    let token = lexer.next_token().unwrap();
    match &token {
        Token::String(bytes) => {
            assert_eq!(bytes.as_slice(), &[b'A', 0xFF, b'B', b'A', b'C']);
        }
        _ => panic!("Expected String token, got {token:?}"),
    }
}
1934
/// With default options, overflowing octal escapes must not panic and must
/// still yield a non-empty string token. The exact bytes are not pinned.
#[test]
fn test_lexer_octal_escape_overflow_no_panic_with_encoding() {
    let mut lexer = Lexer::new(Cursor::new(b"(\\777\\400\\577)" as &[u8]));

    let token = lexer.next_token().unwrap();
    match &token {
        Token::String(bytes) => assert!(!bytes.is_empty()),
        _ => panic!("Expected String token, got {token:?}"),
    }
}
1948}