1use super::{ParseError, ParseOptions, ParseResult, ParseWarning};
6use std::io::{Read, Seek, SeekFrom};
7
/// Lexical tokens produced by the lexer when scanning a PDF byte stream.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// `true` or `false` keyword.
    Boolean(bool),

    /// Integer number, e.g. `123` or `-456`.
    Integer(i64),

    /// Real (floating-point) number, e.g. `3.14`.
    Real(f64),

    /// String content — literal `(...)` or hex `<...>` — kept as raw bytes.
    String(Vec<u8>),

    /// Name object, e.g. `/Type` (leading slash stripped, `#xx` escapes decoded).
    Name(String),

    /// `[` — start of an array.
    ArrayStart,

    /// `]` — end of an array.
    ArrayEnd,

    /// `<<` — start of a dictionary.
    DictStart,

    /// `>>` — end of a dictionary.
    DictEnd,

    /// `stream` keyword.
    Stream,

    /// `endstream` keyword.
    EndStream,

    /// `obj` keyword.
    Obj,

    /// `endobj` keyword.
    EndObj,

    /// `startxref` keyword.
    StartXRef,

    /// Indirect object reference (object number, generation number).
    /// NOTE(review): the lexer itself emits `R` as `Token::Name("R")`; this
    /// variant appears to be assembled by a higher layer — confirm.
    Reference(u32, u16),

    /// `null` keyword.
    Null,

    /// `%...` comment text, up to but not including the end-of-line marker.
    Comment(String),

    /// End of input.
    Eof,
}
65
/// Streaming PDF tokenizer over any [`Read`] source.
pub struct Lexer<R> {
    /// Buffered reader over the underlying byte source.
    reader: std::io::BufReader<R>,
    /// Scratch buffer; currently unused.
    #[allow(dead_code)]
    buffer: Vec<u8>,
    /// Count of bytes consumed so far; used for error positions.
    position: usize,
    /// Single-byte lookahead pulled from the reader but not yet consumed.
    peek_buffer: Option<u8>,
    /// Tokens pushed back via `push_token`; served LIFO before reading input.
    token_buffer: Vec<Token>,
    /// Parsing behaviour flags (lenient syntax/encoding, warning collection).
    options: ParseOptions,
    /// Warnings accumulated during lenient parsing.
    warnings: Vec<ParseWarning>,
}
77
78impl<R: Read> Lexer<R> {
79 pub fn new(reader: R) -> Self {
81 Self::new_with_options(reader, ParseOptions::default())
82 }
83
    /// Creates a lexer over `reader` with explicit parsing `options`.
    pub fn new_with_options(reader: R, options: ParseOptions) -> Self {
        Self {
            reader: std::io::BufReader::new(reader),
            // Pre-sized scratch space; see the (currently unused) `buffer` field.
            buffer: Vec::with_capacity(1024),
            position: 0,
            peek_buffer: None,
            token_buffer: Vec::new(),
            options,
            warnings: Vec::new(),
        }
    }
96
    /// Returns the warnings collected so far during lenient parsing.
    pub fn warnings(&self) -> &[ParseWarning] {
        &self.warnings
    }
101
    /// Returns the next token from the input stream.
    ///
    /// Pushed-back tokens (see `push_token`) are served first, LIFO. End of
    /// input yields `Token::Eof` rather than an error. In lenient mode,
    /// unexpected characters are skipped (recursing to fetch the following
    /// token); in strict mode they raise a `SyntaxError`.
    pub fn next_token(&mut self) -> ParseResult<Token> {
        // Serve any pushed-back token before touching the reader.
        if let Some(token) = self.token_buffer.pop() {
            return Ok(token);
        }

        self.skip_whitespace()?;

        let ch = match self.peek_char()? {
            Some(ch) => ch,
            None => return Ok(Token::Eof),
        };

        // Dispatch on the first character of the token.
        match ch {
            b'%' => self.read_comment(),
            b'/' => self.read_name(),
            b'(' => self.read_literal_string(),
            // `<<` dictionary start, or `<...>` hex string.
            b'<' => self.read_angle_bracket(),
            b'>' => {
                // Only `>>` (dictionary end) is valid; a lone `>` is an error.
                self.consume_char()?;
                if self.peek_char()? == Some(b'>') {
                    self.consume_char()?;
                    Ok(Token::DictEnd)
                } else {
                    Err(ParseError::SyntaxError {
                        position: self.position,
                        message: "Expected '>' after '>'".to_string(),
                    })
                }
            }
            b'[' => {
                self.consume_char()?;
                Ok(Token::ArrayStart)
            }
            b']' => {
                self.consume_char()?;
                Ok(Token::ArrayEnd)
            }
            b't' | b'f' => self.read_boolean(),
            b'n' => self.read_null(),
            b'+' | b'-' | b'0'..=b'9' | b'.' => self.read_number(),
            b'R' => {
                // Emitted as a bare name; a higher layer combines
                // `num gen R` into an indirect reference.
                self.consume_char()?;
                Ok(Token::Name("R".to_string()))
            }
            _ if ch.is_ascii_alphabetic() => self.read_keyword(),
            b';' => {
                // Stray semicolons are silently skipped.
                self.consume_char()?;
                self.next_token()
            }
            _ => {
                if self.is_problematic_encoding_char(ch) {
                    // Control/C1 bytes get dedicated recovery handling.
                    self.handle_encoding_char_in_token_stream(ch)
                } else if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        tracing::debug!(
                            "Warning: Skipping unexpected character '{}' at position {}",
                            ch as char,
                            self.position
                        );
                    }
                    // Skip the offending byte and retry.
                    self.consume_char()?;
                    self.next_token()
                } else {
                    Err(ParseError::SyntaxError {
                        position: self.position,
                        message: format!("Unexpected character: {}", ch as char),
                    })
                }
            }
        }
    }
179
180 fn peek_char(&mut self) -> ParseResult<Option<u8>> {
182 if let Some(ch) = self.peek_buffer {
183 return Ok(Some(ch));
184 }
185
186 let mut buf = [0u8; 1];
187 match self.reader.read_exact(&mut buf) {
188 Ok(_) => {
189 self.peek_buffer = Some(buf[0]);
190 Ok(Some(buf[0]))
191 }
192 Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => Ok(None),
193 Err(e) => Err(e.into()),
194 }
195 }
196
197 fn consume_char(&mut self) -> ParseResult<Option<u8>> {
199 let ch = self.peek_char()?;
200 if ch.is_some() {
201 self.peek_buffer = None;
202 self.position += 1;
203 }
204 Ok(ch)
205 }
206
207 pub(crate) fn skip_whitespace(&mut self) -> ParseResult<usize> {
209 let mut count = 0;
210 while let Some(ch) = self.peek_char()? {
211 if ch.is_ascii_whitespace() {
212 self.consume_char()?;
213 count += 1;
214 } else {
215 break;
216 }
217 }
218 Ok(count)
219 }
220
221 fn read_comment(&mut self) -> ParseResult<Token> {
223 self.consume_char()?; let mut comment = String::new();
225
226 while let Some(ch) = self.peek_char()? {
227 if ch == b'\n' || ch == b'\r' {
228 break;
229 }
230 self.consume_char()?;
231 comment.push(ch as char);
232 }
233
234 Ok(Token::Comment(comment))
235 }
236
237 fn read_name(&mut self) -> ParseResult<Token> {
239 self.consume_char()?; let mut name = String::new();
241
242 while let Some(ch) = self.peek_char()? {
243 if ch.is_ascii_whitespace()
244 || matches!(ch, b'/' | b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'%')
245 {
246 break;
247 }
248 self.consume_char()?;
249
250 if ch == b'#' {
252 let hex1 = self
253 .consume_char()?
254 .ok_or_else(|| ParseError::SyntaxError {
255 position: self.position,
256 message: "Incomplete hex code in name".to_string(),
257 })?;
258 let hex2 = self
259 .consume_char()?
260 .ok_or_else(|| ParseError::SyntaxError {
261 position: self.position,
262 message: "Incomplete hex code in name".to_string(),
263 })?;
264
265 let value = u8::from_str_radix(&format!("{}{}", hex1 as char, hex2 as char), 16)
266 .map_err(|_| ParseError::SyntaxError {
267 position: self.position,
268 message: "Invalid hex code in name".to_string(),
269 })?;
270
271 name.push(value as char);
272 } else {
273 name.push(ch as char);
274 }
275 }
276
277 Ok(Token::Name(name))
278 }
279
    /// Reads a literal string `(...)`, handling backslash escapes, octal
    /// escapes (1–3 digits), and balanced nested parentheses.
    ///
    /// In lenient mode an unterminated string yields its partial content
    /// (plus a recovery warning); otherwise it is a `SyntaxError`. When
    /// `lenient_encoding` is set, the raw bytes pass through encoding
    /// recovery before being returned.
    fn read_literal_string(&mut self) -> ParseResult<Token> {
        // Drop the opening '('.
        self.consume_char()?;
        let mut string = Vec::new();
        // Unescaped '(' / ')' must balance inside a PDF string.
        let mut paren_depth = 1;
        let mut escape = false;

        while paren_depth > 0 {
            let ch = match self.consume_char()? {
                Some(c) => c,
                None => {
                    // EOF before the closing ')'.
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            self.warnings.push(ParseWarning::SyntaxErrorRecovered {
                                position: self.position,
                                expected: "closing parenthesis".to_string(),
                                found: "EOF".to_string(),
                                recovery_action: "returned partial string content".to_string(),
                            });
                        }
                        break;
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: self.position,
                            message: "Unterminated string".to_string(),
                        });
                    }
                }
            };

            if escape {
                let escaped = match ch {
                    b'n' => b'\n',
                    b'r' => b'\r',
                    b't' => b'\t',
                    b'b' => b'\x08',
                    b'f' => b'\x0C',
                    b'(' => b'(',
                    b')' => b')',
                    b'\\' => b'\\',
                    b'0'..=b'7' => {
                        // Octal escape: up to two more octal digits follow.
                        let mut value = ch - b'0';
                        for _ in 0..2 {
                            if let Some(next) = self.peek_char()? {
                                if matches!(next, b'0'..=b'7') {
                                    self.consume_char()?;
                                    value = value * 8 + (next - b'0');
                                } else {
                                    break;
                                }
                            }
                        }
                        value
                    }
                    // Unknown escape: keep the character, drop the backslash.
                    // NOTE(review): the PDF `\<newline>` line-continuation is
                    // not special-cased here, so the newline byte is kept —
                    // confirm whether callers rely on that.
                    _ => ch,
                };
                string.push(escaped);
                escape = false;
            } else {
                match ch {
                    b'\\' => escape = true,
                    b'(' => {
                        string.push(ch);
                        paren_depth += 1;
                    }
                    b')' => {
                        paren_depth -= 1;
                        // The outermost ')' terminates and is not stored.
                        if paren_depth > 0 {
                            string.push(ch);
                        }
                    }
                    _ => string.push(ch),
                }
            }
        }

        // Optionally run encoding recovery over the raw bytes.
        let processed_string = if self.options.lenient_encoding {
            self.process_string_with_encoding_recovery(&string)?
        } else {
            string
        };

        Ok(Token::String(processed_string))
    }
367
    /// Reads the construct introduced by `<`: either `<<` (dictionary start)
    /// or a hex string `<...>`.
    ///
    /// Whitespace between hex digits is ignored. An odd digit count is
    /// padded with a trailing `0`, per the PDF convention. In lenient mode,
    /// invalid characters and a missing closing `>` are recovered with
    /// warnings; otherwise they raise syntax errors.
    fn read_angle_bracket(&mut self) -> ParseResult<Token> {
        // Drop the first '<'.
        self.consume_char()?;
        if self.peek_char()? == Some(b'<') {
            self.consume_char()?;
            Ok(Token::DictStart)
        } else {
            let mut hex_chars = String::new();
            let mut found_end = false;

            while let Some(ch) = self.peek_char()? {
                if ch == b'>' {
                    self.consume_char()?;
                    found_end = true;
                    break;
                }
                self.consume_char()?;
                if ch.is_ascii_hexdigit() {
                    hex_chars.push(ch as char);
                } else if !ch.is_ascii_whitespace() {
                    // Non-hex, non-whitespace byte inside the hex string.
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            self.warnings.push(ParseWarning::SyntaxErrorRecovered {
                                position: self.position,
                                expected: "hex digit".to_string(),
                                found: format!("'{}'", ch as char),
                                recovery_action: "skipped invalid character".to_string(),
                            });
                        }
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: self.position,
                            message: "Invalid character in hex string".to_string(),
                        });
                    }
                }
            }

            if !found_end {
                // EOF before the closing '>'.
                if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        self.warnings.push(ParseWarning::SyntaxErrorRecovered {
                            position: self.position,
                            expected: ">".to_string(),
                            found: "EOF".to_string(),
                            recovery_action: "returned partial hex string".to_string(),
                        });
                    }
                } else {
                    return Err(ParseError::SyntaxError {
                        position: self.position,
                        message: "Unterminated hex string".to_string(),
                    });
                }
            }

            // Odd digit count: pad with '0' (e.g. <ABC> decodes as AB C0).
            if hex_chars.len() % 2 != 0 {
                hex_chars.push('0');
            }

            // Decode two hex digits per output byte.
            let mut bytes = Vec::new();
            for chunk in hex_chars.as_bytes().chunks(2) {
                let hex_str = std::str::from_utf8(chunk).map_err(|_| ParseError::SyntaxError {
                    position: self.position,
                    message: "Invalid UTF-8 in hex string".to_string(),
                })?;
                let byte =
                    u8::from_str_radix(hex_str, 16).map_err(|_| ParseError::SyntaxError {
                        position: self.position,
                        message: "Invalid hex string".to_string(),
                    })?;
                bytes.push(byte);
            }

            Ok(Token::String(bytes))
        }
    }
451
452 fn read_boolean(&mut self) -> ParseResult<Token> {
454 let word = self.read_word()?;
455 match word.as_str() {
456 "true" => Ok(Token::Boolean(true)),
457 "false" => Ok(Token::Boolean(false)),
458 _ => {
459 self.process_keyword(word)
461 }
462 }
463 }
464
465 fn read_null(&mut self) -> ParseResult<Token> {
467 let word = self.read_word()?;
468 if word == "null" {
469 Ok(Token::Null)
470 } else {
471 self.process_keyword(word)
473 }
474 }
475
    /// Reads an integer or real number: optional sign, digits with at most
    /// one decimal point, and an optional scientific-notation exponent.
    ///
    /// Any number containing `.` or an exponent is returned as `Real`;
    /// otherwise it is parsed as `Integer`.
    fn read_number(&mut self) -> ParseResult<Token> {
        let mut number_str = String::new();
        let mut has_dot = false;

        // Optional leading sign; must be followed by a digit or '.'.
        if let Some(ch) = self.peek_char()? {
            if ch == b'+' || ch == b'-' {
                self.consume_char()?;
                number_str.push(ch as char);

                if let Some(next) = self.peek_char()? {
                    if !next.is_ascii_digit() && next != b'.' {
                        return Err(ParseError::SyntaxError {
                            position: self.position,
                            message: "Expected digit after sign".to_string(),
                        });
                    }
                }
            }
        }

        // Digits, allowing a single decimal point.
        while let Some(ch) = self.peek_char()? {
            match ch {
                b'0'..=b'9' => {
                    self.consume_char()?;
                    number_str.push(ch as char);
                }
                b'.' if !has_dot => {
                    self.consume_char()?;
                    number_str.push(ch as char);
                    has_dot = true;
                }
                _ => break,
            }
        }

        // Optional exponent part (e.g. `1.23e10`, `2E+3`).
        if let Some(ch) = self.peek_char()? {
            if ch == b'e' || ch == b'E' {
                self.consume_char()?;
                number_str.push(ch as char);

                // Optional exponent sign.
                if let Some(sign_ch) = self.peek_char()? {
                    if sign_ch == b'+' || sign_ch == b'-' {
                        self.consume_char()?;
                        number_str.push(sign_ch as char);
                    }
                }

                // Exponent digits.
                while let Some(digit_ch) = self.peek_char()? {
                    if digit_ch.is_ascii_digit() {
                        self.consume_char()?;
                        number_str.push(digit_ch as char);
                    } else {
                        break;
                    }
                }

                // An exponent always forces a Real result (even `1e0`).
                has_dot = true;
            }
        }

        if has_dot {
            let value = number_str
                .parse::<f64>()
                .map_err(|_| ParseError::SyntaxError {
                    position: self.position,
                    message: format!("Invalid real number: '{number_str}'"),
                })?;
            Ok(Token::Real(value))
        } else {
            let value = number_str
                .parse::<i64>()
                .map_err(|_| ParseError::SyntaxError {
                    position: self.position,
                    message: format!("Invalid integer: '{number_str}'"),
                })?;
            Ok(Token::Integer(value))
        }
    }
566
    /// Reads an alphabetic word and resolves it via `process_keyword`.
    fn read_keyword(&mut self) -> ParseResult<Token> {
        let word = self.read_word()?;
        self.process_keyword(word)
    }
572
573 fn process_keyword(&self, word: String) -> ParseResult<Token> {
575 match word.as_str() {
576 "stream" => Ok(Token::Stream),
577 "endstream" => Ok(Token::EndStream),
578 "obj" => Ok(Token::Obj),
579 "endobj" => Ok(Token::EndObj),
580 "startxref" => Ok(Token::StartXRef),
581 _ => Err(ParseError::SyntaxError {
582 position: self.position,
583 message: format!("Unknown keyword: {word}"),
584 }),
585 }
586 }
587
588 fn read_word(&mut self) -> ParseResult<String> {
590 let mut word = String::new();
591
592 while let Some(ch) = self.peek_char()? {
593 if ch.is_ascii_whitespace()
594 || matches!(ch, b'/' | b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'%')
595 {
596 break;
597 }
598 self.consume_char()?;
599 word.push(ch as char);
600 }
601
602 Ok(word)
603 }
604
605 #[allow(dead_code)]
607 fn read_digits(&mut self) -> ParseResult<String> {
608 let mut digits = String::new();
609
610 while let Some(ch) = self.peek_char()? {
611 if ch.is_ascii_digit() {
612 self.consume_char()?;
613 digits.push(ch as char);
614 } else {
615 break;
616 }
617 }
618
619 Ok(digits)
620 }
621
622 pub fn read_newline(&mut self) -> ParseResult<()> {
624 match self.peek_char()? {
625 Some(b'\r') => {
626 self.consume_char()?;
627 if self.peek_char()? == Some(b'\n') {
629 self.consume_char()?;
630 }
631 Ok(())
632 }
633 Some(b'\n') => {
634 self.consume_char()?;
635 Ok(())
636 }
637 _ => Err(ParseError::SyntaxError {
638 position: self.position,
639 message: "Expected newline".to_string(),
640 }),
641 }
642 }
643
644 pub fn peek_byte(&mut self) -> ParseResult<u8> {
647 match self.peek_char()? {
648 Some(b) => Ok(b),
649 None => Err(ParseError::UnexpectedToken {
650 expected: "byte".to_string(),
651 found: "EOF".to_string(),
652 }),
653 }
654 }
655
656 pub fn read_byte(&mut self) -> ParseResult<u8> {
658 match self.consume_char()? {
659 Some(b) => Ok(b),
660 None => Err(ParseError::UnexpectedToken {
661 expected: "byte".to_string(),
662 found: "EOF".to_string(),
663 }),
664 }
665 }
666
667 pub fn seek(&mut self, pos: u64) -> ParseResult<()>
669 where
670 R: Seek,
671 {
672 self.reader.seek(SeekFrom::Start(pos))?;
673 self.position = pos as usize;
674 Ok(())
675 }
676
    /// Reads exactly `n` raw bytes, honouring any pending lookahead byte.
    ///
    /// # Errors
    /// Propagates I/O errors (including `UnexpectedEof`) from the reader.
    pub fn read_bytes(&mut self, n: usize) -> ParseResult<Vec<u8>> {
        let mut bytes = Vec::with_capacity(n);

        // A previously peeked byte is logically the next byte of the
        // stream, so it must be delivered first.
        if self.peek_buffer.is_some() && n > 0 {
            if let Some(byte) = self.consume_char()? {
                bytes.push(byte);
            }
        }

        let remaining = n - bytes.len();
        if remaining > 0 {
            let mut rest = vec![0u8; remaining];
            self.reader.read_exact(&mut rest)?;
            // `consume_char` above already advanced `position` by one.
            self.position += remaining;
            bytes.extend_from_slice(&rest);
        }

        Ok(bytes)
    }
698
699 pub fn read_until_sequence(&mut self, sequence: &[u8]) -> ParseResult<Vec<u8>> {
701 let mut result = Vec::new();
702 let mut match_pos = 0;
703
704 while let Some(ch) = self.consume_char()? {
705 result.push(ch);
706
707 if ch == sequence[match_pos] {
708 match_pos += 1;
709 if match_pos == sequence.len() {
710 result.truncate(result.len() - sequence.len());
712 break;
713 }
714 } else if ch == sequence[0] {
715 match_pos = 1;
716 } else {
717 match_pos = 0;
718 }
719 }
720
721 if match_pos < sequence.len() {
722 return Err(ParseError::SyntaxError {
723 position: self.position,
724 message: format!("Sequence {sequence:?} not found"),
725 });
726 }
727
728 Ok(result)
729 }
730
    /// Number of bytes consumed so far (used for error reporting).
    pub fn position(&self) -> usize {
        self.position
    }
735
    /// Pushes `token` back so the next `next_token` call returns it (LIFO).
    pub fn push_token(&mut self, token: Token) {
        self.token_buffer.push(token);
    }
740
741 pub fn expect_keyword(&mut self, keyword: &str) -> ParseResult<()> {
743 let token = self.next_token()?;
744 match (keyword, &token) {
745 ("endstream", Token::EndStream) => Ok(()),
746 ("stream", Token::Stream) => Ok(()),
747 ("endobj", Token::EndObj) => Ok(()),
748 ("obj", Token::Obj) => Ok(()),
749 ("startxref", Token::StartXRef) => Ok(()),
750 _ => Err(ParseError::UnexpectedToken {
751 expected: format!("keyword '{keyword}'"),
752 found: format!("{token:?}"),
753 }),
754 }
755 }
756
    /// Scans ahead (up to `max_bytes`) for the literal byte sequence
    /// `keyword`, returning the offset — relative to the current raw reader
    /// position — where it begins, or `None` if not found. The reader
    /// offset and lookahead byte are restored before returning.
    ///
    /// NOTE(review): the scan begins at the raw reader offset, so a byte
    /// already held in `peek_buffer` is never examined and offsets are
    /// relative to the post-peek position — confirm callers expect this.
    pub fn find_keyword_ahead(
        &mut self,
        keyword: &str,
        max_bytes: usize,
    ) -> ParseResult<Option<usize>>
    where
        R: Seek,
    {
        use std::io::{Read, Seek, SeekFrom};

        // Snapshot reader state so the scan has no observable side effects.
        let current_pos = self.reader.stream_position()?;
        let start_buffer_state = self.peek_buffer;

        let keyword_bytes = keyword.as_bytes();
        let mut bytes_read = 0;
        // Sliding window of the last `keyword.len()` bytes seen.
        let mut match_buffer = Vec::new();

        while bytes_read < max_bytes {
            let mut byte = [0u8; 1];
            match self.reader.read_exact(&mut byte) {
                Ok(_) => {
                    bytes_read += 1;
                    match_buffer.push(byte[0]);

                    if match_buffer.len() > keyword_bytes.len() {
                        match_buffer.remove(0);
                    }

                    if match_buffer.len() == keyword_bytes.len() && match_buffer == keyword_bytes {
                        // Found: rewind and report where the match started.
                        self.reader.seek(SeekFrom::Start(current_pos))?;
                        self.peek_buffer = start_buffer_state;
                        return Ok(Some(bytes_read - keyword_bytes.len()));
                    }
                }
                // Any read error (typically EOF) ends the scan: not found.
                Err(_) => break,
            }
        }

        self.reader.seek(SeekFrom::Start(current_pos))?;
        self.peek_buffer = start_buffer_state;
        Ok(None)
    }
807
    /// Returns up to `n` raw bytes starting at the current reader offset
    /// without consuming them; reader state is fully restored.
    ///
    /// NOTE(review): a byte pending in `peek_buffer` logically precedes
    /// these bytes but is not included here — confirm callers expect raw
    /// reader-relative bytes.
    pub fn peek_ahead(&mut self, n: usize) -> ParseResult<Vec<u8>>
    where
        R: Seek,
    {
        use std::io::{Read, Seek, SeekFrom};

        let current_pos = self.reader.stream_position()?;
        let start_buffer_state = self.peek_buffer;

        // A short read near EOF is fine; truncate to what was available.
        let mut bytes = vec![0u8; n];
        let bytes_read = self.reader.read(&mut bytes)?;
        bytes.truncate(bytes_read);

        self.reader.seek(SeekFrom::Start(current_pos))?;
        self.peek_buffer = start_buffer_state;

        Ok(bytes)
    }
830
    /// Snapshots the reader offset and lookahead byte for `restore_position`.
    pub fn save_position(&mut self) -> ParseResult<(u64, Option<u8>)>
    where
        R: Seek,
    {
        use std::io::Seek;
        let pos = self.reader.stream_position()?;
        Ok((pos, self.peek_buffer))
    }
840
    /// Restores a snapshot taken by `save_position`.
    ///
    /// NOTE(review): `position` is set to the raw reader offset; when a byte
    /// was buffered in `peek_buffer`, the logical position is one less than
    /// that offset, so error positions may drift by one — confirm.
    pub fn restore_position(&mut self, saved: (u64, Option<u8>)) -> ParseResult<()>
    where
        R: Seek,
    {
        use std::io::{Seek, SeekFrom};
        self.reader.seek(SeekFrom::Start(saved.0))?;
        self.peek_buffer = saved.1;
        self.position = saved.0 as usize;
        Ok(())
    }
852
853 pub fn peek_token(&mut self) -> ParseResult<Token>
855 where
856 R: Seek,
857 {
858 let saved_pos = self.save_position()?;
859 let token = self.next_token()?;
860 self.restore_position(saved_pos)?;
861 Ok(token)
862 }
863
    /// Runs decoder-based recovery over raw literal-string bytes.
    ///
    /// Bytes in the C1 block (0x80–0x9F), BEL, or C0 controls other than
    /// HT/LF/CR trigger an aggressive Windows-1252-preferred decode; clean
    /// input uses the configured lenient/preferred settings. On decoder
    /// failure in lenient mode, a byte-level ASCII fallback is applied;
    /// otherwise a `CharacterEncodingError` is raised. When
    /// `collect_warnings` is set, replacements are recorded as warnings.
    fn process_string_with_encoding_recovery(
        &mut self,
        string_bytes: &[u8],
    ) -> ParseResult<Vec<u8>> {
        use super::encoding::{CharacterDecoder, EncodingOptions, EncodingType, EnhancedDecoder};

        // Detect bytes that strongly suggest a legacy 8-bit encoding.
        let has_problematic_chars = string_bytes.iter().any(|&b| {
            (0x80..=0x9F).contains(&b)
                || b == 0x07
                || (b <= 0x1F && b != 0x09 && b != 0x0A && b != 0x0D)
        });

        let decoder = EnhancedDecoder::new();

        let encoding_options = if has_problematic_chars {
            // Aggressive recovery: prefer Windows-1252 and allow many
            // replacement characters (scaled with input length).
            EncodingOptions {
                lenient_mode: true,
                preferred_encoding: Some(EncodingType::Windows1252),
                max_replacements: std::cmp::max(100, string_bytes.len() / 10),
                log_issues: self.options.collect_warnings,
            }
        } else {
            EncodingOptions {
                lenient_mode: self.options.lenient_encoding,
                preferred_encoding: self.options.preferred_encoding,
                max_replacements: 50,
                log_issues: self.options.collect_warnings,
            }
        };

        match decoder.decode(string_bytes, &encoding_options) {
            Ok(result) => {
                if (result.replacement_count > 0 || has_problematic_chars)
                    && self.options.collect_warnings
                {
                    self.warnings.push(ParseWarning::InvalidEncoding {
                        position: self.position,
                        recovered_text: if result.text.len() > 50 {
                            // Truncate long samples at a char boundary
                            // (~47 chars) so the slice below cannot panic.
                            let truncate_at = result
                                .text
                                .char_indices()
                                .map(|(i, _)| i)
                                .nth(47)
                                .unwrap_or_else(|| {
                                    // Fewer than 48 chars but > 50 bytes:
                                    // back off to a valid boundary <= 47.
                                    let limit = result.text.len().min(47);
                                    let mut pos = limit;
                                    while pos > 0 && !result.text.is_char_boundary(pos) {
                                        pos -= 1;
                                    }
                                    pos
                                });

                            // Belt-and-braces: re-verify the boundary before
                            // slicing; fall back to a char-based copy.
                            let safe_text = if truncate_at <= result.text.len()
                                && result.text.is_char_boundary(truncate_at)
                            {
                                result.text[..truncate_at].to_string()
                            } else {
                                result.text.chars().take(47).collect::<String>()
                            };

                            format!(
                                "{}... (truncated, {} chars total)",
                                safe_text,
                                result.text.chars().count()
                            )
                        } else {
                            result.text.clone()
                        },
                        encoding_used: result.detected_encoding,
                        replacement_count: result.replacement_count,
                    });
                }

                Ok(result.text.into_bytes())
            }
            Err(encoding_error) => {
                if self.options.lenient_encoding {
                    // The decoder gave up entirely: map bytes to ASCII
                    // lookalikes instead of failing the parse.
                    let fallback_result = self.apply_fallback_encoding_strategy(string_bytes);

                    if self.options.collect_warnings {
                        self.warnings.push(ParseWarning::InvalidEncoding {
                            position: self.position,
                            recovered_text: format!(
                                "Fallback strategy applied: {} -> {} chars",
                                string_bytes.len(),
                                fallback_result.len()
                            ),
                            encoding_used: None,
                            replacement_count: string_bytes.len(),
                        });
                    }
                    Ok(fallback_result)
                } else {
                    Err(ParseError::CharacterEncodingError {
                        position: self.position,
                        message: format!(
                            "Failed to decode string with any supported encoding: {encoding_error}"
                        ),
                    })
                }
            }
        }
    }
978
979 fn apply_fallback_encoding_strategy(&self, string_bytes: &[u8]) -> Vec<u8> {
981 let mut result = Vec::with_capacity(string_bytes.len());
982
983 for &byte in string_bytes {
984 match byte {
985 0x00..=0x08 | 0x0B | 0x0C | 0x0E..=0x1F => {
987 result.push(b' '); }
989 0x80..=0x9F => {
990 let replacement = match byte {
992 0x80 => b'E', 0x81 => b' ', 0x82 => b',', 0x83 => b'f', 0x84 => b'"', 0x85 => b'.', 0x86 => b'+', 0x87 => b'+', 0x88 => b'^', 0x89 => b'%', 0x8A => b'S', 0x8B => b'<', 0x8C => b'O', 0x8D => b' ', 0x8E => b'Z', 0x8F => b' ', 0x90 => b' ', 0x91 => b'\'', 0x92 => b'\'', 0x93 => b'"', 0x94 => b'"', 0x95 => b'*', 0x96 => b'-', 0x97 => b'-', 0x98 => b'~', 0x99 => b'T', 0x9A => b's', 0x9B => b'>', 0x9C => b'o', 0x9D => b' ', 0x9E => b'z', 0x9F => b'Y', _ => b'?', };
1026 result.push(replacement);
1027 }
1028 _ => {
1029 result.push(byte); }
1031 }
1032 }
1033
1034 result
1035 }
1036
1037 fn is_problematic_encoding_char(&self, ch: u8) -> bool {
1039 (0x80..=0x9F).contains(&ch) ||
1041 ch == 0x07 || (ch <= 0x1F && ch != 0x09 && ch != 0x0A && ch != 0x0D) || (self.options.lenient_syntax && ch >= 0xA0) }
1046
    /// Handles a problematic (control/C1) byte found where a token was
    /// expected.
    ///
    /// In lenient-encoding mode the byte is skipped (optionally recording a
    /// warning) and the following token is returned; EOF right after the
    /// skipped byte is an error. In strict mode a descriptive
    /// `CharacterEncodingError` is raised instead.
    fn handle_encoding_char_in_token_stream(&mut self, ch: u8) -> ParseResult<Token> {
        if self.options.lenient_encoding {
            self.consume_char()?;

            if self.options.collect_warnings {
                // Classify the byte for the warning message.
                let replacement_char = match ch {
                    0x07 => "bell",
                    0x00..=0x1F => "control",
                    0x80..=0x9F => "latin1-supplement",
                    _ => "unknown",
                };

                self.warnings.push(ParseWarning::InvalidEncoding {
                    position: self.position,
                    recovered_text: format!(
                        "Skipped problematic {replacement_char} character (0x{ch:02X})"
                    ),
                    encoding_used: None,
                    replacement_count: 1,
                });
            }

            self.skip_whitespace()?;
            if let Ok(Some(_)) = self.peek_char() {
                // Retry tokenization from the next meaningful byte.
                self.next_token()
            } else {
                Err(ParseError::SyntaxError {
                    position: self.position,
                    message: "Unexpected end of file after problematic character".to_string(),
                })
            }
        } else {
            // Strict mode: build a human-readable description of the byte.
            let char_description = match ch {
                0x07 => "Bell character (\\u{07})".to_string(),
                0x00..=0x1F => format!("Control character (\\u{{{ch:02X}}})"),
                0x80..=0x9F => format!("Latin-1 supplement character (\\u{{{ch:02X}}})"),
                _ => format!("Problematic character (\\u{{{ch:02X}}})"),
            };

            Err(ParseError::CharacterEncodingError {
                position: self.position,
                message: format!(
                    "Unexpected character: {char_description} - Consider using lenient parsing mode"
                ),
            })
        }
    }
1099}
1100
1101#[cfg(test)]
1102mod tests {
1103 use super::*;
1104 use std::io::Cursor;
1105
1106 #[test]
1107 fn test_lexer_basic_tokens() {
1108 let input = b"123 -456 3.14 true false null /Name";
1110 let mut lexer = Lexer::new(Cursor::new(input));
1111
1112 assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
1113 assert_eq!(lexer.next_token().unwrap(), Token::Integer(-456));
1114 assert_eq!(lexer.next_token().unwrap(), Token::Real(3.14));
1115 assert_eq!(lexer.next_token().unwrap(), Token::Boolean(true));
1116 assert_eq!(lexer.next_token().unwrap(), Token::Boolean(false));
1117 assert_eq!(lexer.next_token().unwrap(), Token::Null);
1118 assert_eq!(lexer.next_token().unwrap(), Token::Name("Name".to_string()));
1119 assert_eq!(lexer.next_token().unwrap(), Token::Eof);
1120 }
1121
1122 #[test]
1123 fn test_lexer_negative_numbers() {
1124 let input = b"-123 -45.67";
1126 let mut lexer = Lexer::new(Cursor::new(input));
1127
1128 assert_eq!(lexer.next_token().unwrap(), Token::Integer(-123));
1129 assert_eq!(lexer.next_token().unwrap(), Token::Real(-45.67));
1130 }
1131
1132 #[test]
1133 fn test_lexer_strings() {
1134 let input = b"(Hello World) <48656C6C6F>";
1135 let mut lexer = Lexer::new(Cursor::new(input));
1136
1137 assert_eq!(
1138 lexer.next_token().unwrap(),
1139 Token::String(b"Hello World".to_vec())
1140 );
1141 assert_eq!(
1142 lexer.next_token().unwrap(),
1143 Token::String(b"Hello".to_vec())
1144 );
1145 }
1146
1147 #[test]
1148 fn test_lexer_dictionaries() {
1149 let input = b"<< /Type /Page >>";
1150 let mut lexer = Lexer::new(Cursor::new(input));
1151
1152 assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
1153 assert_eq!(lexer.next_token().unwrap(), Token::Name("Type".to_string()));
1154 assert_eq!(lexer.next_token().unwrap(), Token::Name("Page".to_string()));
1155 assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
1156 }
1157
1158 #[test]
1159 fn test_lexer_arrays() {
1160 let input = b"[1 2 3]";
1161 let mut lexer = Lexer::new(Cursor::new(input));
1162
1163 assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
1164 assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
1165 assert_eq!(lexer.next_token().unwrap(), Token::Integer(2));
1166 assert_eq!(lexer.next_token().unwrap(), Token::Integer(3));
1167 assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
1168 }
1169
1170 #[test]
1171 fn test_lexer_references() {
1172 let input = b"1 0 R 25 1 R";
1173 let mut lexer = Lexer::new(Cursor::new(input));
1174
1175 assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
1177 assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
1178 match lexer.next_token().unwrap() {
1180 Token::Name(s) if s == "R" => {} other => panic!("Expected R token, got {other:?}"),
1182 }
1183
1184 assert_eq!(lexer.next_token().unwrap(), Token::Integer(25));
1185 assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
1186 match lexer.next_token().unwrap() {
1187 Token::Name(s) if s == "R" => {} other => panic!("Expected R token, got {other:?}"),
1189 }
1190 }
1191
1192 #[test]
1193 fn test_lexer_comments() {
1194 let input = b"%PDF-1.7\n123";
1195 let mut lexer = Lexer::new(Cursor::new(input));
1196
1197 assert_eq!(
1198 lexer.next_token().unwrap(),
1199 Token::Comment("PDF-1.7".to_string())
1200 );
1201 assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
1202 }
1203
1204 mod comprehensive_tests {
1206 use super::*;
1207 use std::io::Cursor;
1208
1209 #[test]
1210 fn test_token_debug_trait() {
1211 let token = Token::Integer(42);
1212 let debug_str = format!("{token:?}");
1213 assert!(debug_str.contains("Integer"));
1214 assert!(debug_str.contains("42"));
1215 }
1216
1217 #[test]
1218 fn test_token_clone() {
1219 let token = Token::String(b"hello".to_vec());
1220 let cloned = token.clone();
1221 assert_eq!(token, cloned);
1222 }
1223
1224 #[test]
1225 fn test_token_equality() {
1226 assert_eq!(Token::Integer(42), Token::Integer(42));
1227 assert_ne!(Token::Integer(42), Token::Integer(43));
1228 assert_eq!(Token::Boolean(true), Token::Boolean(true));
1229 assert_ne!(Token::Boolean(true), Token::Boolean(false));
1230 assert_eq!(Token::Null, Token::Null);
1231 assert_ne!(Token::Null, Token::Integer(0));
1232 }
1233
1234 #[test]
1235 fn test_lexer_empty_input() {
1236 let input = b"";
1237 let mut lexer = Lexer::new(Cursor::new(input));
1238 assert_eq!(lexer.next_token().unwrap(), Token::Eof);
1239 }
1240
1241 #[test]
1242 fn test_lexer_whitespace_only() {
1243 let input = b" \t\n\r ";
1244 let mut lexer = Lexer::new(Cursor::new(input));
1245 assert_eq!(lexer.next_token().unwrap(), Token::Eof);
1246 }
1247
1248 #[test]
1249 fn test_lexer_integer_edge_cases() {
1250 let input = b"0 +123 -0 9876543210";
1251 let mut lexer = Lexer::new(Cursor::new(input));
1252
1253 assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
1254 assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
1255 assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
1256 assert_eq!(lexer.next_token().unwrap(), Token::Integer(9876543210));
1257 }
1258
1259 #[test]
1260 fn test_lexer_real_edge_cases() {
1261 let input = b"0.0 +3.14 -2.71828 .5 5. 123.456789";
1262 let mut lexer = Lexer::new(Cursor::new(input));
1263
1264 assert_eq!(lexer.next_token().unwrap(), Token::Real(0.0));
1265 assert_eq!(lexer.next_token().unwrap(), Token::Real(3.14));
1266 assert_eq!(lexer.next_token().unwrap(), Token::Real(-2.71828));
1267 assert_eq!(lexer.next_token().unwrap(), Token::Real(0.5));
1268 assert_eq!(lexer.next_token().unwrap(), Token::Real(5.0));
1269 assert_eq!(lexer.next_token().unwrap(), Token::Real(123.456789));
1270 }
1271
1272 #[test]
1273 fn test_lexer_scientific_notation() {
1274 let input = b"1.23e10 -4.56E-5 1e0 2E+3";
1275 let mut lexer = Lexer::new(Cursor::new(input));
1276
1277 assert_eq!(lexer.next_token().unwrap(), Token::Real(1.23e10));
1278 assert_eq!(lexer.next_token().unwrap(), Token::Real(-4.56e-5));
1279 assert_eq!(lexer.next_token().unwrap(), Token::Real(1e0));
1280 assert_eq!(lexer.next_token().unwrap(), Token::Real(2e3));
1281 }
1282
1283 #[test]
1284 fn test_lexer_string_literal_escapes() {
1285 let input = b"(Hello\\nWorld) (Tab\\tChar) (Quote\\\"Mark) (Backslash\\\\)";
1286 let mut lexer = Lexer::new(Cursor::new(input));
1287
1288 assert_eq!(
1289 lexer.next_token().unwrap(),
1290 Token::String(b"Hello\nWorld".to_vec())
1291 );
1292 assert_eq!(
1293 lexer.next_token().unwrap(),
1294 Token::String(b"Tab\tChar".to_vec())
1295 );
1296 assert_eq!(
1297 lexer.next_token().unwrap(),
1298 Token::String(b"Quote\"Mark".to_vec())
1299 );
1300 assert_eq!(
1301 lexer.next_token().unwrap(),
1302 Token::String(b"Backslash\\".to_vec())
1303 );
1304 }
1305
1306 #[test]
1307 fn test_lexer_string_literal_nested_parens() {
1308 let input = b"(Nested (parentheses) work)";
1309 let mut lexer = Lexer::new(Cursor::new(input));
1310
1311 assert_eq!(
1312 lexer.next_token().unwrap(),
1313 Token::String(b"Nested (parentheses) work".to_vec())
1314 );
1315 }
1316
1317 #[test]
1318 fn test_lexer_string_literal_empty() {
1319 let input = b"()";
1320 let mut lexer = Lexer::new(Cursor::new(input));
1321
1322 assert_eq!(lexer.next_token().unwrap(), Token::String(b"".to_vec()));
1323 }
1324
1325 #[test]
1326 fn test_lexer_hexadecimal_strings() {
1327 let input = b"<48656C6C6F> <20576F726C64> <>";
1328 let mut lexer = Lexer::new(Cursor::new(input));
1329
1330 assert_eq!(
1331 lexer.next_token().unwrap(),
1332 Token::String(b"Hello".to_vec())
1333 );
1334 assert_eq!(
1335 lexer.next_token().unwrap(),
1336 Token::String(b" World".to_vec())
1337 );
1338 assert_eq!(lexer.next_token().unwrap(), Token::String(b"".to_vec()));
1339 }
1340
1341 #[test]
1342 fn test_lexer_hexadecimal_strings_odd_length() {
1343 let input = b"<48656C6C6F2> <1> <ABC>";
1344 let mut lexer = Lexer::new(Cursor::new(input));
1345
1346 assert_eq!(
1348 lexer.next_token().unwrap(),
1349 Token::String(b"Hello ".to_vec())
1350 );
1351 assert_eq!(lexer.next_token().unwrap(), Token::String(b"\x10".to_vec()));
1352 assert_eq!(
1353 lexer.next_token().unwrap(),
1354 Token::String(b"\xAB\xC0".to_vec())
1355 );
1356 }
1357
1358 #[test]
1359 fn test_lexer_hexadecimal_strings_whitespace() {
1360 let input = b"<48 65 6C 6C 6F>";
1361 let mut lexer = Lexer::new(Cursor::new(input));
1362
1363 assert_eq!(
1364 lexer.next_token().unwrap(),
1365 Token::String(b"Hello".to_vec())
1366 );
1367 }
1368
1369 #[test]
1370 fn test_lexer_names() {
1371 let input = b"/Type /Page /Root /Kids /Count /MediaBox";
1372 let mut lexer = Lexer::new(Cursor::new(input));
1373
1374 assert_eq!(lexer.next_token().unwrap(), Token::Name("Type".to_string()));
1375 assert_eq!(lexer.next_token().unwrap(), Token::Name("Page".to_string()));
1376 assert_eq!(lexer.next_token().unwrap(), Token::Name("Root".to_string()));
1377 assert_eq!(lexer.next_token().unwrap(), Token::Name("Kids".to_string()));
1378 assert_eq!(
1379 lexer.next_token().unwrap(),
1380 Token::Name("Count".to_string())
1381 );
1382 assert_eq!(
1383 lexer.next_token().unwrap(),
1384 Token::Name("MediaBox".to_string())
1385 );
1386 }
1387
1388 #[test]
1389 fn test_lexer_names_with_special_chars() {
1390 let input = b"/Name#20with#20spaces /Name#2Fwith#2Fslashes";
1391 let mut lexer = Lexer::new(Cursor::new(input));
1392
1393 assert_eq!(
1394 lexer.next_token().unwrap(),
1395 Token::Name("Name with spaces".to_string())
1396 );
1397 assert_eq!(
1398 lexer.next_token().unwrap(),
1399 Token::Name("Name/with/slashes".to_string())
1400 );
1401 }
1402
1403 #[test]
1404 fn test_lexer_names_edge_cases() {
1405 let input = b"/ /A /123 /true /false /null";
1406 let mut lexer = Lexer::new(Cursor::new(input));
1407
1408 assert_eq!(lexer.next_token().unwrap(), Token::Name("".to_string()));
1409 assert_eq!(lexer.next_token().unwrap(), Token::Name("A".to_string()));
1410 assert_eq!(lexer.next_token().unwrap(), Token::Name("123".to_string()));
1411 assert_eq!(lexer.next_token().unwrap(), Token::Name("true".to_string()));
1412 assert_eq!(
1413 lexer.next_token().unwrap(),
1414 Token::Name("false".to_string())
1415 );
1416 assert_eq!(lexer.next_token().unwrap(), Token::Name("null".to_string()));
1417 }
1418
1419 #[test]
1420 fn test_lexer_nested_dictionaries() {
1421 let input = b"<< /Type /Page /Resources << /Font << /F1 123 0 R >> >> >>";
1422 let mut lexer = Lexer::new(Cursor::new(input));
1423
1424 assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
1425 assert_eq!(lexer.next_token().unwrap(), Token::Name("Type".to_string()));
1426 assert_eq!(lexer.next_token().unwrap(), Token::Name("Page".to_string()));
1427 assert_eq!(
1428 lexer.next_token().unwrap(),
1429 Token::Name("Resources".to_string())
1430 );
1431 assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
1432 assert_eq!(lexer.next_token().unwrap(), Token::Name("Font".to_string()));
1433 assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
1434 assert_eq!(lexer.next_token().unwrap(), Token::Name("F1".to_string()));
1435 assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
1436 assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
1437 assert_eq!(lexer.next_token().unwrap(), Token::Name("R".to_string()));
1438 assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
1439 assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
1440 assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
1441 }
1442
1443 #[test]
1444 fn test_lexer_nested_arrays() {
1445 let input = b"[[1 2] [3 4] [5 [6 7]]]";
1446 let mut lexer = Lexer::new(Cursor::new(input));
1447
1448 assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
1449 assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
1450 assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
1451 assert_eq!(lexer.next_token().unwrap(), Token::Integer(2));
1452 assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
1453 assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
1454 assert_eq!(lexer.next_token().unwrap(), Token::Integer(3));
1455 assert_eq!(lexer.next_token().unwrap(), Token::Integer(4));
1456 assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
1457 assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
1458 assert_eq!(lexer.next_token().unwrap(), Token::Integer(5));
1459 assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
1460 assert_eq!(lexer.next_token().unwrap(), Token::Integer(6));
1461 assert_eq!(lexer.next_token().unwrap(), Token::Integer(7));
1462 assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
1463 assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
1464 assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
1465 }
1466
1467 #[test]
1468 fn test_lexer_mixed_content() {
1469 let input = b"<< /Type /Page /MediaBox [0 0 612 792] /Resources << /Font << /F1 << /Type /Font /Subtype /Type1 >> >> >> >>";
1470 let mut lexer = Lexer::new(Cursor::new(input));
1471
1472 let mut tokens = Vec::new();
1474 loop {
1475 match lexer.next_token().unwrap() {
1476 Token::Eof => break,
1477 token => tokens.push(token),
1478 }
1479 }
1480 assert!(tokens.len() > 10);
1481 }
1482
1483 #[test]
1484 fn test_lexer_keywords() {
1485 let input = b"obj endobj stream endstream startxref";
1486 let mut lexer = Lexer::new(Cursor::new(input));
1487
1488 assert_eq!(lexer.next_token().unwrap(), Token::Obj);
1489 assert_eq!(lexer.next_token().unwrap(), Token::EndObj);
1490 assert_eq!(lexer.next_token().unwrap(), Token::Stream);
1491 assert_eq!(lexer.next_token().unwrap(), Token::EndStream);
1492 assert_eq!(lexer.next_token().unwrap(), Token::StartXRef);
1493 }
1494
1495 #[test]
1496 fn test_lexer_multiple_comments() {
1497 let input = b"%First comment\n%Second comment\n123";
1498 let mut lexer = Lexer::new(Cursor::new(input));
1499
1500 assert_eq!(
1501 lexer.next_token().unwrap(),
1502 Token::Comment("First comment".to_string())
1503 );
1504 assert_eq!(
1505 lexer.next_token().unwrap(),
1506 Token::Comment("Second comment".to_string())
1507 );
1508 assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
1509 }
1510
1511 #[test]
1512 fn test_lexer_comment_without_newline() {
1513 let input = b"%Comment at end";
1514 let mut lexer = Lexer::new(Cursor::new(input));
1515
1516 assert_eq!(
1517 lexer.next_token().unwrap(),
1518 Token::Comment("Comment at end".to_string())
1519 );
1520 assert_eq!(lexer.next_token().unwrap(), Token::Eof);
1521 }
1522
1523 #[test]
1524 fn test_lexer_special_characters_in_streams() {
1525 let input = b"<< /Length 5 >> stream\nHello endstream";
1526 let mut lexer = Lexer::new(Cursor::new(input));
1527
1528 assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
1529 assert_eq!(
1530 lexer.next_token().unwrap(),
1531 Token::Name("Length".to_string())
1532 );
1533 assert_eq!(lexer.next_token().unwrap(), Token::Integer(5));
1534 assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
1535 assert_eq!(lexer.next_token().unwrap(), Token::Stream);
1536 }
1538
1539 #[test]
1540 fn test_lexer_push_token() {
1541 let input = b"123 456";
1542 let mut lexer = Lexer::new(Cursor::new(input));
1543
1544 let token1 = lexer.next_token().unwrap();
1545 assert_eq!(token1, Token::Integer(123));
1546
1547 let token2 = lexer.next_token().unwrap();
1548 assert_eq!(token2, Token::Integer(456));
1549
1550 lexer.push_token(token2.clone());
1552
1553 let token3 = lexer.next_token().unwrap();
1555 assert_eq!(token3, token2);
1556
1557 let token4 = lexer.next_token().unwrap();
1559 assert_eq!(token4, Token::Eof);
1560 }
1561
1562 #[test]
1563 fn test_lexer_push_multiple_tokens() {
1564 let input = b"123";
1565 let mut lexer = Lexer::new(Cursor::new(input));
1566
1567 let original_token = lexer.next_token().unwrap();
1568 assert_eq!(original_token, Token::Integer(123));
1569
1570 lexer.push_token(Token::Boolean(true));
1572 lexer.push_token(Token::Boolean(false));
1573 lexer.push_token(Token::Null);
1574
1575 assert_eq!(lexer.next_token().unwrap(), Token::Null);
1577 assert_eq!(lexer.next_token().unwrap(), Token::Boolean(false));
1578 assert_eq!(lexer.next_token().unwrap(), Token::Boolean(true));
1579 assert_eq!(lexer.next_token().unwrap(), Token::Eof);
1580 }
1581
1582 #[test]
1583 fn test_lexer_read_newline() {
1584 let input = b"123\n456\r\n789";
1585 let mut lexer = Lexer::new(Cursor::new(input));
1586
1587 let digits1 = lexer.read_digits().unwrap();
1589 assert_eq!(digits1, "123");
1590 assert!(lexer.read_newline().is_ok());
1591
1592 let digits2 = lexer.read_digits().unwrap();
1594 assert_eq!(digits2, "456");
1595 assert!(lexer.read_newline().is_ok());
1596
1597 let digits3 = lexer.read_digits().unwrap();
1599 assert_eq!(digits3, "789");
1600 }
1601
1602 #[test]
1603 fn test_lexer_read_bytes() {
1604 let input = b"Hello World";
1605 let mut lexer = Lexer::new(Cursor::new(input));
1606
1607 let bytes = lexer.read_bytes(5).unwrap();
1608 assert_eq!(bytes, b"Hello");
1609
1610 let bytes = lexer.read_bytes(6).unwrap();
1611 assert_eq!(bytes, b" World");
1612 }
1613
1614 #[test]
1615 fn test_lexer_read_until_sequence() {
1616 let input = b"Hello endstream World";
1617 let mut lexer = Lexer::new(Cursor::new(input));
1618
1619 let result = lexer.read_until_sequence(b"endstream").unwrap();
1620 assert_eq!(result, b"Hello ");
1621
1622 let rest = lexer.read_digits().unwrap();
1624 assert_eq!(rest, ""); }
1626
1627 #[test]
1628 fn test_lexer_read_until_sequence_not_found() {
1629 let input = b"Hello World";
1630 let mut lexer = Lexer::new(Cursor::new(input));
1631
1632 let result = lexer.read_until_sequence(b"notfound");
1633 assert!(result.is_err());
1634 }
1635
1636 #[test]
1637 fn test_lexer_position_tracking() {
1638 let input = b"123 456";
1639 let mut lexer = Lexer::new(Cursor::new(input));
1640
1641 let initial_pos = lexer.position();
1642 assert_eq!(initial_pos, 0);
1643
1644 lexer.next_token().unwrap(); let pos_after_first = lexer.position();
1646 assert!(pos_after_first > initial_pos);
1647
1648 lexer.next_token().unwrap(); let pos_after_second = lexer.position();
1650 assert!(pos_after_second > pos_after_first);
1651 }
1652
1653 #[test]
1654 fn test_lexer_large_numbers() {
1655 let input = b"2147483647 -2147483648 9223372036854775807 -9223372036854775808";
1656 let mut lexer = Lexer::new(Cursor::new(input));
1657
1658 assert_eq!(lexer.next_token().unwrap(), Token::Integer(2147483647));
1659 assert_eq!(lexer.next_token().unwrap(), Token::Integer(-2147483648));
1660 assert_eq!(
1661 lexer.next_token().unwrap(),
1662 Token::Integer(9223372036854775807)
1663 );
1664 assert_eq!(
1665 lexer.next_token().unwrap(),
1666 Token::Integer(-9223372036854775808)
1667 );
1668 }
1669
1670 #[test]
1671 fn test_lexer_very_long_string() {
1672 let long_str = "A".repeat(1000);
1673 let input = format!("({long_str})");
1674 let mut lexer = Lexer::new(Cursor::new(input.as_bytes()));
1675
1676 if let Token::String(s) = lexer.next_token().unwrap() {
1677 assert_eq!(s.len(), 1000);
1678 assert_eq!(s, long_str.as_bytes());
1679 } else {
1680 panic!("Expected string token");
1681 }
1682 }
1683
1684 #[test]
1685 fn test_lexer_very_long_name() {
1686 let long_name = "A".repeat(500);
1687 let input = format!("/{long_name}");
1688 let mut lexer = Lexer::new(Cursor::new(input.as_bytes()));
1689
1690 if let Token::Name(name) = lexer.next_token().unwrap() {
1691 assert_eq!(name.len(), 500);
1692 assert_eq!(name, long_name);
1693 } else {
1694 panic!("Expected name token");
1695 }
1696 }
1697
1698 #[test]
1699 fn test_lexer_error_handling_invalid_hex() {
1700 let input = b"<48656C6C6FG>";
1701 let mut lexer = Lexer::new(Cursor::new(input));
1702
1703 let result = lexer.next_token();
1705 assert!(result.is_ok() || result.is_err()); }
1707
1708 #[test]
1709 fn test_lexer_all_token_types() {
1710 let input = b"true false null 123 -456 3.14 (string) <48656C6C6F> /Name [ ] << >> obj endobj stream endstream startxref % comment\n";
1711 let mut lexer = Lexer::new(Cursor::new(input));
1712
1713 let mut token_types = Vec::new();
1714 loop {
1715 match lexer.next_token().unwrap() {
1716 Token::Eof => break,
1717 token => token_types.push(std::mem::discriminant(&token)),
1718 }
1719 }
1720
1721 assert!(token_types.len() > 10);
1723 }
1724
1725 #[test]
1726 fn test_lexer_performance() {
1727 let input = "123 456 789 ".repeat(1000);
1728 let mut lexer = Lexer::new(Cursor::new(input.as_bytes()));
1729
1730 let start_time = std::time::Instant::now();
1731 let mut count = 0;
1732 loop {
1733 match lexer.next_token().unwrap() {
1734 Token::Eof => break,
1735 _ => count += 1,
1736 }
1737 }
1738 let elapsed = start_time.elapsed();
1739
1740 assert_eq!(count, 3000); assert!(elapsed.as_millis() < 1000); }
1743 }
1744
1745 #[test]
1746 fn test_lexer_find_keyword_ahead() {
1747 let input = b"some data here endstream more data";
1748 let mut lexer = Lexer::new(Cursor::new(input));
1749
1750 let result = lexer.find_keyword_ahead("endstream", 100);
1752 assert!(result.is_ok());
1753 assert_eq!(result.unwrap(), Some(15)); let result2 = lexer.find_keyword_ahead("notfound", 100);
1757 assert!(result2.is_ok());
1758 assert_eq!(result2.unwrap(), None);
1759
1760 let result3 = lexer.find_keyword_ahead("endstream", 10);
1762 assert!(result3.is_ok());
1763 assert_eq!(result3.unwrap(), None); }
1765
1766 #[test]
1767 fn test_lexer_peek_token() {
1768 let input = b"123 456 /Name";
1769 let mut lexer = Lexer::new(Cursor::new(input));
1770
1771 let peeked = lexer.peek_token();
1773 assert!(peeked.is_ok());
1774 assert_eq!(peeked.unwrap(), Token::Integer(123));
1775
1776 let next = lexer.next_token();
1778 assert!(next.is_ok());
1779 assert_eq!(next.unwrap(), Token::Integer(123));
1780
1781 assert_eq!(lexer.peek_token().unwrap(), Token::Integer(456));
1783 assert_eq!(lexer.next_token().unwrap(), Token::Integer(456));
1784
1785 assert_eq!(lexer.peek_token().unwrap(), Token::Name("Name".to_string()));
1786 assert_eq!(lexer.next_token().unwrap(), Token::Name("Name".to_string()));
1787 }
1788
1789 #[test]
1790 fn test_lexer_expect_keyword() {
1791 let input = b"endstream obj endobj";
1792 let mut lexer = Lexer::new(Cursor::new(input));
1793
1794 assert!(lexer.expect_keyword("endstream").is_ok());
1796
1797 assert!(lexer.expect_keyword("obj").is_ok());
1799
1800 let result = lexer.expect_keyword("stream");
1802 assert!(result.is_err());
1803 match result {
1804 Err(ParseError::UnexpectedToken { expected, found }) => {
1805 assert!(expected.contains("stream"));
1806 assert!(found.contains("EndObj"));
1807 }
1808 _ => panic!("Expected UnexpectedToken error"),
1809 }
1810 }
1811
1812 #[test]
1813 fn test_lexer_save_restore_position() {
1814 let input = b"123 456 789";
1815 let mut lexer = Lexer::new(Cursor::new(input));
1816
1817 assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
1819
1820 let saved = lexer.save_position();
1822 assert!(saved.is_ok());
1823 let saved_pos = saved.unwrap();
1824
1825 assert_eq!(lexer.next_token().unwrap(), Token::Integer(456));
1827 assert_eq!(lexer.next_token().unwrap(), Token::Integer(789));
1828
1829 assert!(lexer.restore_position(saved_pos).is_ok());
1831
1832 assert_eq!(lexer.next_token().unwrap(), Token::Integer(456));
1834 }
1835
    #[test]
    fn test_lexer_character_encoding_recovery() {
        // String bytes outside ASCII (0x80, 0x91, 0x92 — presumably
        // Windows-1252 punctuation; confirm against the encoding handling)
        // must not break lenient string lexing: the raw bytes pass through.
        let input = b"(Caf\x80 \x91Hello\x92)"; let options = ParseOptions::lenient();
        let mut lexer = Lexer::new_with_options(Cursor::new(input), options);

        match lexer.next_token().unwrap() {
            Token::String(bytes) => {
                // Lossy UTF-8 conversion is only for logging; the assertion
                // just requires that some text was recovered.
                let text = String::from_utf8_lossy(&bytes);
                tracing::debug!("Recovered text: {text}");
                assert!(!text.is_empty()); }
            other => panic!("Expected String token, got {other:?}"),
        }

        // Lenient mode may record encoding warnings rather than erroring;
        // surface them for debugging without failing the test.
        let warnings = lexer.warnings();
        if !warnings.is_empty() {
            tracing::debug!("Encoding warnings: {warnings:?}");
        }
    }
1859}