1use super::{ParseError, ParseOptions, ParseResult, ParseWarning};
6use std::io::{Read, Seek, SeekFrom};
7
/// Lexical tokens produced by the PDF [`Lexer`].
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// `true` / `false`.
    Boolean(bool),

    /// Integer number, e.g. `123` or `-456`.
    Integer(i64),

    /// Real number, e.g. `3.14`; also produced for exponent forms.
    Real(f64),

    /// String contents as raw bytes (from literal `(...)` or hex `<...>`).
    String(Vec<u8>),

    /// Name object without the leading slash (`/Type` -> `Type`).
    Name(String),

    /// `[`
    ArrayStart,

    /// `]`
    ArrayEnd,

    /// `<<`
    DictStart,

    /// `>>`
    DictEnd,

    /// `stream` keyword.
    Stream,

    /// `endstream` keyword.
    EndStream,

    /// `obj` keyword.
    Obj,

    /// `endobj` keyword.
    EndObj,

    /// `startxref` keyword.
    StartXRef,

    /// Indirect reference (object number, generation number).
    /// NOTE(review): this lexer emits `Name("R")` for the `R` operator
    /// (see the tests); presumably a higher layer assembles this variant.
    Reference(u32, u16),

    /// `null` keyword.
    Null,

    /// `%...` comment text, excluding the percent sign and line ending.
    Comment(String),

    /// End of input.
    Eof,
}
65
/// Streaming PDF tokenizer with one byte of lookahead, token push-back,
/// and configurable lenient recovery from malformed input.
pub struct Lexer<R> {
    // Buffered source of raw PDF bytes.
    reader: std::io::BufReader<R>,
    #[allow(dead_code)]
    // Scratch buffer reserved at construction; currently unused.
    buffer: Vec<u8>,
    // Logical count of bytes consumed; reported in errors/warnings.
    position: usize,
    // One-byte lookahead cache backing peek_char/consume_char.
    peek_buffer: Option<u8>,
    // LIFO stack of pushed-back tokens (see push_token).
    token_buffer: Vec<Token>,
    // Parsing behavior flags (lenient syntax/encoding, warning collection).
    options: ParseOptions,
    // Warnings recorded while recovering from malformed input.
    warnings: Vec<ParseWarning>,
}
77
78impl<R: Read> Lexer<R> {
    /// Creates a lexer with default [`ParseOptions`].
    pub fn new(reader: R) -> Self {
        Self::new_with_options(reader, ParseOptions::default())
    }
83
    /// Creates a lexer with explicit parsing options.
    pub fn new_with_options(reader: R, options: ParseOptions) -> Self {
        Self {
            reader: std::io::BufReader::new(reader),
            buffer: Vec::with_capacity(1024),
            position: 0,
            peek_buffer: None,
            token_buffer: Vec::new(),
            options,
            warnings: Vec::new(),
        }
    }
96
    /// Warnings collected so far during lenient parsing.
    pub fn warnings(&self) -> &[ParseWarning] {
        &self.warnings
    }
101
102 pub fn next_token(&mut self) -> ParseResult<Token> {
104 if let Some(token) = self.token_buffer.pop() {
106 return Ok(token);
107 }
108
109 self.skip_whitespace()?;
110
111 let ch = match self.peek_char()? {
112 Some(ch) => ch,
113 None => return Ok(Token::Eof),
114 };
115
116 match ch {
117 b'%' => self.read_comment(),
118 b'/' => self.read_name(),
119 b'(' => self.read_literal_string(),
120 b'<' => self.read_angle_bracket(),
121 b'>' => {
122 self.consume_char()?;
123 if self.peek_char()? == Some(b'>') {
124 self.consume_char()?;
125 Ok(Token::DictEnd)
126 } else {
127 Err(ParseError::SyntaxError {
128 position: self.position,
129 message: "Expected '>' after '>'".to_string(),
130 })
131 }
132 }
133 b'[' => {
134 self.consume_char()?;
135 Ok(Token::ArrayStart)
136 }
137 b']' => {
138 self.consume_char()?;
139 Ok(Token::ArrayEnd)
140 }
141 b't' | b'f' => self.read_boolean(),
142 b'n' => self.read_null(),
143 b'+' | b'-' | b'0'..=b'9' | b'.' => self.read_number(),
144 b'R' => {
145 self.consume_char()?;
147 Ok(Token::Name("R".to_string()))
148 }
149 _ if ch.is_ascii_alphabetic() => self.read_keyword(),
150 b';' => {
151 self.consume_char()?;
153 self.next_token() }
155 _ => {
156 if self.is_problematic_encoding_char(ch) {
158 self.handle_encoding_char_in_token_stream(ch)
159 } else if self.options.lenient_syntax {
160 if self.options.collect_warnings {
162 eprintln!(
163 "Warning: Skipping unexpected character '{}' at position {}",
164 ch as char, self.position
165 );
166 }
167 self.consume_char()?;
168 self.next_token() } else {
170 Err(ParseError::SyntaxError {
171 position: self.position,
172 message: format!("Unexpected character: {}", ch as char),
173 })
174 }
175 }
176 }
177 }
178
179 fn peek_char(&mut self) -> ParseResult<Option<u8>> {
181 if let Some(ch) = self.peek_buffer {
182 return Ok(Some(ch));
183 }
184
185 let mut buf = [0u8; 1];
186 match self.reader.read_exact(&mut buf) {
187 Ok(_) => {
188 self.peek_buffer = Some(buf[0]);
189 Ok(Some(buf[0]))
190 }
191 Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => Ok(None),
192 Err(e) => Err(e.into()),
193 }
194 }
195
196 fn consume_char(&mut self) -> ParseResult<Option<u8>> {
198 let ch = self.peek_char()?;
199 if ch.is_some() {
200 self.peek_buffer = None;
201 self.position += 1;
202 }
203 Ok(ch)
204 }
205
206 pub(crate) fn skip_whitespace(&mut self) -> ParseResult<usize> {
208 let mut count = 0;
209 while let Some(ch) = self.peek_char()? {
210 if ch.is_ascii_whitespace() {
211 self.consume_char()?;
212 count += 1;
213 } else {
214 break;
215 }
216 }
217 Ok(count)
218 }
219
220 fn read_comment(&mut self) -> ParseResult<Token> {
222 self.consume_char()?; let mut comment = String::new();
224
225 while let Some(ch) = self.peek_char()? {
226 if ch == b'\n' || ch == b'\r' {
227 break;
228 }
229 self.consume_char()?;
230 comment.push(ch as char);
231 }
232
233 Ok(Token::Comment(comment))
234 }
235
236 fn read_name(&mut self) -> ParseResult<Token> {
238 self.consume_char()?; let mut name = String::new();
240
241 while let Some(ch) = self.peek_char()? {
242 if ch.is_ascii_whitespace()
243 || matches!(ch, b'/' | b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'%')
244 {
245 break;
246 }
247 self.consume_char()?;
248
249 if ch == b'#' {
251 let hex1 = self
252 .consume_char()?
253 .ok_or_else(|| ParseError::SyntaxError {
254 position: self.position,
255 message: "Incomplete hex code in name".to_string(),
256 })?;
257 let hex2 = self
258 .consume_char()?
259 .ok_or_else(|| ParseError::SyntaxError {
260 position: self.position,
261 message: "Incomplete hex code in name".to_string(),
262 })?;
263
264 let value = u8::from_str_radix(&format!("{}{}", hex1 as char, hex2 as char), 16)
265 .map_err(|_| ParseError::SyntaxError {
266 position: self.position,
267 message: "Invalid hex code in name".to_string(),
268 })?;
269
270 name.push(value as char);
271 } else {
272 name.push(ch as char);
273 }
274 }
275
276 Ok(Token::Name(name))
277 }
278
    /// Reads a PDF literal string `(...)`, handling backslash escapes,
    /// octal escapes, and balanced nested parentheses. Returns the raw
    /// (possibly encoding-recovered) bytes.
    fn read_literal_string(&mut self) -> ParseResult<Token> {
        self.consume_char()?; // consume the opening '('
        let mut string = Vec::new();
        let mut paren_depth = 1; // unescaped parens must balance
        let mut escape = false;

        while paren_depth > 0 {
            let ch = match self.consume_char()? {
                Some(c) => c,
                None => {
                    // EOF inside the string: keep the partial content in
                    // lenient mode, otherwise fail.
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            self.warnings.push(ParseWarning::SyntaxErrorRecovered {
                                position: self.position,
                                expected: "closing parenthesis".to_string(),
                                found: "EOF".to_string(),
                                recovery_action: "returned partial string content".to_string(),
                            });
                        }
                        break;
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: self.position,
                            message: "Unterminated string".to_string(),
                        });
                    }
                }
            };

            if escape {
                let escaped = match ch {
                    b'n' => b'\n',
                    b'r' => b'\r',
                    b't' => b'\t',
                    b'b' => b'\x08',
                    b'f' => b'\x0C',
                    b'(' => b'(',
                    b')' => b')',
                    b'\\' => b'\\',
                    b'0'..=b'7' => {
                        // Octal escape: up to three octal digits (\d, \dd, \ddd).
                        let mut value = ch - b'0';
                        for _ in 0..2 {
                            if let Some(next) = self.peek_char()? {
                                if matches!(next, b'0'..=b'7') {
                                    self.consume_char()?;
                                    value = value * 8 + (next - b'0');
                                } else {
                                    break;
                                }
                            }
                        }
                        value
                    }
                    // NOTE(review): unknown escapes keep the escaped byte
                    // verbatim; a backslash-newline line continuation is
                    // NOT collapsed here — confirm against the PDF spec.
                    _ => ch,
                };
                string.push(escaped);
                escape = false;
            } else {
                match ch {
                    b'\\' => escape = true,
                    b'(' => {
                        string.push(ch);
                        paren_depth += 1;
                    }
                    b')' => {
                        paren_depth -= 1;
                        // The final closing paren is not part of the content.
                        if paren_depth > 0 {
                            string.push(ch);
                        }
                    }
                    _ => string.push(ch),
                }
            }
        }

        // Optionally run encoding recovery over the raw bytes.
        let processed_string = if self.options.lenient_encoding {
            self.process_string_with_encoding_recovery(&string)?
        } else {
            string
        };

        Ok(Token::String(processed_string))
    }
366
    /// Reads a token starting with `<`: either a dictionary opener `<<`
    /// or a hex string `<...>`.
    fn read_angle_bracket(&mut self) -> ParseResult<Token> {
        self.consume_char()?; // consume the first '<'
        if self.peek_char()? == Some(b'<') {
            self.consume_char()?;
            Ok(Token::DictStart)
        } else {
            let mut hex_chars = String::new();
            let mut found_end = false;

            // Collect hex digits up to '>'. Whitespace between digits is
            // ignored; other bytes are errors (or skipped in lenient mode).
            while let Some(ch) = self.peek_char()? {
                if ch == b'>' {
                    self.consume_char()?;
                    found_end = true;
                    break;
                }
                self.consume_char()?;
                if ch.is_ascii_hexdigit() {
                    hex_chars.push(ch as char);
                } else if !ch.is_ascii_whitespace() {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            self.warnings.push(ParseWarning::SyntaxErrorRecovered {
                                position: self.position,
                                expected: "hex digit".to_string(),
                                found: format!("'{}'", ch as char),
                                recovery_action: "skipped invalid character".to_string(),
                            });
                        }
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: self.position,
                            message: "Invalid character in hex string".to_string(),
                        });
                    }
                }
            }

            if !found_end {
                // EOF before '>': keep the partial string in lenient mode.
                if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        self.warnings.push(ParseWarning::SyntaxErrorRecovered {
                            position: self.position,
                            expected: ">".to_string(),
                            found: "EOF".to_string(),
                            recovery_action: "returned partial hex string".to_string(),
                        });
                    }
                } else {
                    return Err(ParseError::SyntaxError {
                        position: self.position,
                        message: "Unterminated hex string".to_string(),
                    });
                }
            }

            // An odd digit count is padded with a trailing '0'.
            if hex_chars.len() % 2 != 0 {
                hex_chars.push('0');
            }

            // Decode digit pairs into bytes.
            let mut bytes = Vec::new();
            for chunk in hex_chars.as_bytes().chunks(2) {
                let hex_str = std::str::from_utf8(chunk).map_err(|_| ParseError::SyntaxError {
                    position: self.position,
                    message: "Invalid UTF-8 in hex string".to_string(),
                })?;
                let byte =
                    u8::from_str_radix(hex_str, 16).map_err(|_| ParseError::SyntaxError {
                        position: self.position,
                        message: "Invalid hex string".to_string(),
                    })?;
                bytes.push(byte);
            }

            Ok(Token::String(bytes))
        }
    }
450
451 fn read_boolean(&mut self) -> ParseResult<Token> {
453 let word = self.read_word()?;
454 match word.as_str() {
455 "true" => Ok(Token::Boolean(true)),
456 "false" => Ok(Token::Boolean(false)),
457 _ => {
458 self.process_keyword(word)
460 }
461 }
462 }
463
464 fn read_null(&mut self) -> ParseResult<Token> {
466 let word = self.read_word()?;
467 if word == "null" {
468 Ok(Token::Null)
469 } else {
470 self.process_keyword(word)
472 }
473 }
474
    /// Reads an integer or real number: optional sign, digits with at
    /// most one decimal point, and an optional `e`/`E` exponent. A dot
    /// or exponent forces the `Real` path; otherwise `Integer`.
    fn read_number(&mut self) -> ParseResult<Token> {
        let mut number_str = String::new();
        let mut has_dot = false;

        // Optional leading sign; it must be followed by a digit or '.'.
        if let Some(ch) = self.peek_char()? {
            if ch == b'+' || ch == b'-' {
                self.consume_char()?;
                number_str.push(ch as char);

                if let Some(next) = self.peek_char()? {
                    if !next.is_ascii_digit() && next != b'.' {
                        return Err(ParseError::SyntaxError {
                            position: self.position,
                            message: "Expected digit after sign".to_string(),
                        });
                    }
                }
            }
        }

        // Mantissa: digits, accepting a single '.'.
        while let Some(ch) = self.peek_char()? {
            match ch {
                b'0'..=b'9' => {
                    self.consume_char()?;
                    number_str.push(ch as char);
                }
                b'.' if !has_dot => {
                    self.consume_char()?;
                    number_str.push(ch as char);
                    has_dot = true;
                }
                _ => break,
            }
        }

        // Optional exponent. NOTE(review): exponents are not standard PDF
        // number syntax; accepted here presumably for robustness.
        if let Some(ch) = self.peek_char()? {
            if ch == b'e' || ch == b'E' {
                self.consume_char()?;
                number_str.push(ch as char);

                if let Some(sign_ch) = self.peek_char()? {
                    if sign_ch == b'+' || sign_ch == b'-' {
                        self.consume_char()?;
                        number_str.push(sign_ch as char);
                    }
                }

                while let Some(digit_ch) = self.peek_char()? {
                    if digit_ch.is_ascii_digit() {
                        self.consume_char()?;
                        number_str.push(digit_ch as char);
                    } else {
                        break;
                    }
                }

                // An exponent always yields a Real.
                has_dot = true;
            }
        }

        if has_dot {
            let value = number_str
                .parse::<f64>()
                .map_err(|_| ParseError::SyntaxError {
                    position: self.position,
                    message: format!("Invalid real number: '{number_str}'"),
                })?;
            Ok(Token::Real(value))
        } else {
            let value = number_str
                .parse::<i64>()
                .map_err(|_| ParseError::SyntaxError {
                    position: self.position,
                    message: format!("Invalid integer: '{number_str}'"),
                })?;
            Ok(Token::Integer(value))
        }
    }
565
    /// Reads an alphabetic word and maps it to a structural keyword token.
    fn read_keyword(&mut self) -> ParseResult<Token> {
        let word = self.read_word()?;
        self.process_keyword(word)
    }
571
    /// Maps a bare word to its structural keyword token; any other word
    /// is rejected as an unknown keyword.
    fn process_keyword(&self, word: String) -> ParseResult<Token> {
        match word.as_str() {
            "stream" => Ok(Token::Stream),
            "endstream" => Ok(Token::EndStream),
            "obj" => Ok(Token::Obj),
            "endobj" => Ok(Token::EndObj),
            "startxref" => Ok(Token::StartXRef),
            _ => Err(ParseError::SyntaxError {
                position: self.position,
                message: format!("Unknown keyword: {word}"),
            }),
        }
    }
586
587 fn read_word(&mut self) -> ParseResult<String> {
589 let mut word = String::new();
590
591 while let Some(ch) = self.peek_char()? {
592 if ch.is_ascii_whitespace()
593 || matches!(ch, b'/' | b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'%')
594 {
595 break;
596 }
597 self.consume_char()?;
598 word.push(ch as char);
599 }
600
601 Ok(word)
602 }
603
604 #[allow(dead_code)]
606 fn read_digits(&mut self) -> ParseResult<String> {
607 let mut digits = String::new();
608
609 while let Some(ch) = self.peek_char()? {
610 if ch.is_ascii_digit() {
611 self.consume_char()?;
612 digits.push(ch as char);
613 } else {
614 break;
615 }
616 }
617
618 Ok(digits)
619 }
620
621 pub fn read_newline(&mut self) -> ParseResult<()> {
623 match self.peek_char()? {
624 Some(b'\r') => {
625 self.consume_char()?;
626 if self.peek_char()? == Some(b'\n') {
628 self.consume_char()?;
629 }
630 Ok(())
631 }
632 Some(b'\n') => {
633 self.consume_char()?;
634 Ok(())
635 }
636 _ => Err(ParseError::SyntaxError {
637 position: self.position,
638 message: "Expected newline".to_string(),
639 }),
640 }
641 }
642
643 pub fn peek_byte(&mut self) -> ParseResult<u8> {
646 match self.peek_char()? {
647 Some(b) => Ok(b),
648 None => Err(ParseError::UnexpectedToken {
649 expected: "byte".to_string(),
650 found: "EOF".to_string(),
651 }),
652 }
653 }
654
655 pub fn read_byte(&mut self) -> ParseResult<u8> {
657 match self.consume_char()? {
658 Some(b) => Ok(b),
659 None => Err(ParseError::UnexpectedToken {
660 expected: "byte".to_string(),
661 found: "EOF".to_string(),
662 }),
663 }
664 }
665
666 pub fn seek(&mut self, pos: u64) -> ParseResult<()>
668 where
669 R: Seek,
670 {
671 self.reader.seek(SeekFrom::Start(pos))?;
672 self.position = pos as usize;
673 Ok(())
674 }
675
676 pub fn read_bytes(&mut self, n: usize) -> ParseResult<Vec<u8>> {
677 let mut bytes = Vec::with_capacity(n);
678
679 if self.peek_buffer.is_some() && n > 0 {
681 if let Some(byte) = self.consume_char()? {
682 bytes.push(byte);
683 }
684 }
685
686 let remaining = n - bytes.len();
688 if remaining > 0 {
689 let mut rest = vec![0u8; remaining];
690 self.reader.read_exact(&mut rest)?;
691 self.position += remaining;
692 bytes.extend_from_slice(&rest);
693 }
694
695 Ok(bytes)
696 }
697
698 pub fn read_until_sequence(&mut self, sequence: &[u8]) -> ParseResult<Vec<u8>> {
700 let mut result = Vec::new();
701 let mut match_pos = 0;
702
703 while let Some(ch) = self.consume_char()? {
704 result.push(ch);
705
706 if ch == sequence[match_pos] {
707 match_pos += 1;
708 if match_pos == sequence.len() {
709 result.truncate(result.len() - sequence.len());
711 break;
712 }
713 } else if ch == sequence[0] {
714 match_pos = 1;
715 } else {
716 match_pos = 0;
717 }
718 }
719
720 if match_pos < sequence.len() {
721 return Err(ParseError::SyntaxError {
722 position: self.position,
723 message: format!("Sequence {sequence:?} not found"),
724 });
725 }
726
727 Ok(result)
728 }
729
    /// Logical byte offset consumed so far.
    pub fn position(&self) -> usize {
        self.position
    }
734
    /// Pushes a token back so the next `next_token` call returns it
    /// (LIFO order when several tokens are pushed).
    pub fn push_token(&mut self, token: Token) {
        self.token_buffer.push(token);
    }
739
740 pub fn expect_keyword(&mut self, keyword: &str) -> ParseResult<()> {
742 let token = self.next_token()?;
743 match (keyword, &token) {
744 ("endstream", Token::EndStream) => Ok(()),
745 ("stream", Token::Stream) => Ok(()),
746 ("endobj", Token::EndObj) => Ok(()),
747 ("obj", Token::Obj) => Ok(()),
748 ("startxref", Token::StartXRef) => Ok(()),
749 _ => Err(ParseError::UnexpectedToken {
750 expected: format!("keyword '{keyword}'"),
751 found: format!("{token:?}"),
752 }),
753 }
754 }
755
    /// Scans up to `max_bytes` ahead for `keyword` and returns the byte
    /// offset of its first occurrence relative to the current reader
    /// position, restoring reader position and lookahead state afterwards.
    ///
    /// NOTE(review): the scan starts at the raw reader position; if a
    /// byte is currently cached in `peek_buffer` it is not examined and
    /// offsets are skewed by one — confirm callers account for this.
    pub fn find_keyword_ahead(
        &mut self,
        keyword: &str,
        max_bytes: usize,
    ) -> ParseResult<Option<usize>>
    where
        R: Seek,
    {
        use std::io::{Read, Seek, SeekFrom};

        let current_pos = self.reader.stream_position()?;
        let start_buffer_state = self.peek_buffer;

        let keyword_bytes = keyword.as_bytes();
        let mut bytes_read = 0;
        let mut match_buffer = Vec::new();

        while bytes_read < max_bytes {
            let mut byte = [0u8; 1];
            match self.reader.read_exact(&mut byte) {
                Ok(_) => {
                    bytes_read += 1;
                    match_buffer.push(byte[0]);

                    // Keep only the last keyword_bytes.len() bytes as a
                    // sliding comparison window.
                    if match_buffer.len() > keyword_bytes.len() {
                        match_buffer.remove(0);
                    }

                    if match_buffer.len() == keyword_bytes.len() && match_buffer == keyword_bytes {
                        // Found: rewind and restore lookahead state.
                        self.reader.seek(SeekFrom::Start(current_pos))?;
                        self.peek_buffer = start_buffer_state;
                        return Ok(Some(bytes_read - keyword_bytes.len()));
                    }
                }
                Err(_) => break, // EOF (or I/O error) ends the scan
            }
        }

        // Not found: rewind and restore lookahead state.
        self.reader.seek(SeekFrom::Start(current_pos))?;
        self.peek_buffer = start_buffer_state;
        Ok(None)
    }
806
    /// Reads up to `n` bytes ahead without consuming them; the reader is
    /// rewound and the lookahead byte restored afterwards. Fewer than `n`
    /// bytes are returned near EOF.
    ///
    /// NOTE(review): bytes come from the raw reader position, so a byte
    /// cached in `peek_buffer` is not included — confirm callers expect
    /// this.
    pub fn peek_ahead(&mut self, n: usize) -> ParseResult<Vec<u8>>
    where
        R: Seek,
    {
        use std::io::{Read, Seek, SeekFrom};

        let current_pos = self.reader.stream_position()?;
        let start_buffer_state = self.peek_buffer;

        let mut bytes = vec![0u8; n];
        let bytes_read = self.reader.read(&mut bytes)?;
        bytes.truncate(bytes_read);

        self.reader.seek(SeekFrom::Start(current_pos))?;
        self.peek_buffer = start_buffer_state;

        Ok(bytes)
    }
829
    /// Captures the raw reader position and lookahead byte for a later
    /// `restore_position`.
    ///
    /// NOTE(review): the logical `self.position` is not captured;
    /// `restore_position` resets it to the raw stream offset, which can
    /// differ by one when a byte is buffered — confirm downstream users
    /// tolerate this.
    pub fn save_position(&mut self) -> ParseResult<(u64, Option<u8>)>
    where
        R: Seek,
    {
        use std::io::Seek;
        let pos = self.reader.stream_position()?;
        Ok((pos, self.peek_buffer))
    }
839
    /// Rewinds the reader and lookahead state to a `save_position`
    /// snapshot; the logical position is reset to the raw stream offset.
    pub fn restore_position(&mut self, saved: (u64, Option<u8>)) -> ParseResult<()>
    where
        R: Seek,
    {
        use std::io::{Seek, SeekFrom};
        self.reader.seek(SeekFrom::Start(saved.0))?;
        self.peek_buffer = saved.1;
        self.position = saved.0 as usize;
        Ok(())
    }
851
852 pub fn peek_token(&mut self) -> ParseResult<Token>
854 where
855 R: Seek,
856 {
857 let saved_pos = self.save_position()?;
858 let token = self.next_token()?;
859 self.restore_position(saved_pos)?;
860 Ok(token)
861 }
862
    /// Decodes raw literal-string bytes through the enhanced decoder,
    /// recording warnings about replacements and falling back to a
    /// deterministic byte-mapping strategy when decoding fails in
    /// lenient mode.
    fn process_string_with_encoding_recovery(
        &mut self,
        string_bytes: &[u8],
    ) -> ParseResult<Vec<u8>> {
        use super::encoding::{CharacterDecoder, EncodingOptions, EncodingType, EnhancedDecoder};

        // C1 bytes (0x80–0x9F), bell, and C0 controls other than
        // tab/LF/CR suggest mis-encoded (e.g. Windows-1252) text.
        let has_problematic_chars = string_bytes.iter().any(|&b| {
            (0x80..=0x9F).contains(&b)
                || b == 0x07
                || (b <= 0x1F && b != 0x09 && b != 0x0A && b != 0x0D)
        });

        let decoder = EnhancedDecoder::new();

        let encoding_options = if has_problematic_chars {
            // Force a permissive Windows-1252 pass with a replacement
            // budget that scales with the input size.
            EncodingOptions {
                lenient_mode: true,
                preferred_encoding: Some(EncodingType::Windows1252),
                max_replacements: std::cmp::max(100, string_bytes.len() / 10),
                log_issues: self.options.collect_warnings,
            }
        } else {
            EncodingOptions {
                lenient_mode: self.options.lenient_encoding,
                preferred_encoding: self.options.preferred_encoding,
                max_replacements: 50,
                log_issues: self.options.collect_warnings,
            }
        };

        match decoder.decode(string_bytes, &encoding_options) {
            Ok(result) => {
                if (result.replacement_count > 0 || has_problematic_chars)
                    && self.options.collect_warnings
                {
                    self.warnings.push(ParseWarning::InvalidEncoding {
                        position: self.position,
                        recovered_text: if result.text.len() > 50 {
                            // Truncate near 47 chars on a char boundary so
                            // the warning text stays valid UTF-8.
                            let truncate_at = result
                                .text
                                .char_indices()
                                .map(|(i, _)| i)
                                .nth(47)
                                .unwrap_or_else(|| {
                                    // Fewer than 48 chars: back off from
                                    // byte 47 to the nearest boundary.
                                    let limit = result.text.len().min(47);
                                    let mut pos = limit;
                                    while pos > 0 && !result.text.is_char_boundary(pos) {
                                        pos -= 1;
                                    }
                                    pos
                                });

                            let safe_text = if truncate_at <= result.text.len()
                                && result.text.is_char_boundary(truncate_at)
                            {
                                result.text[..truncate_at].to_string()
                            } else {
                                result.text.chars().take(47).collect::<String>()
                            };

                            format!(
                                "{}... (truncated, {} chars total)",
                                safe_text,
                                result.text.chars().count()
                            )
                        } else {
                            result.text.clone()
                        },
                        encoding_used: result.detected_encoding,
                        replacement_count: result.replacement_count,
                    });
                }

                Ok(result.text.into_bytes())
            }
            Err(encoding_error) => {
                if self.options.lenient_encoding {
                    // Last resort: deterministic per-byte replacement.
                    let fallback_result = self.apply_fallback_encoding_strategy(string_bytes);

                    if self.options.collect_warnings {
                        self.warnings.push(ParseWarning::InvalidEncoding {
                            position: self.position,
                            recovered_text: format!(
                                "Fallback strategy applied: {} -> {} chars",
                                string_bytes.len(),
                                fallback_result.len()
                            ),
                            encoding_used: None,
                            replacement_count: string_bytes.len(),
                        });
                    }
                    Ok(fallback_result)
                } else {
                    Err(ParseError::CharacterEncodingError {
                        position: self.position,
                        message: format!(
                            "Failed to decode string with any supported encoding: {encoding_error}"
                        ),
                    })
                }
            }
        }
    }
977
978 fn apply_fallback_encoding_strategy(&self, string_bytes: &[u8]) -> Vec<u8> {
980 let mut result = Vec::with_capacity(string_bytes.len());
981
982 for &byte in string_bytes {
983 match byte {
984 0x00..=0x08 | 0x0B | 0x0C | 0x0E..=0x1F => {
986 result.push(b' '); }
988 0x80..=0x9F => {
989 let replacement = match byte {
991 0x80 => b'E', 0x81 => b' ', 0x82 => b',', 0x83 => b'f', 0x84 => b'"', 0x85 => b'.', 0x86 => b'+', 0x87 => b'+', 0x88 => b'^', 0x89 => b'%', 0x8A => b'S', 0x8B => b'<', 0x8C => b'O', 0x8D => b' ', 0x8E => b'Z', 0x8F => b' ', 0x90 => b' ', 0x91 => b'\'', 0x92 => b'\'', 0x93 => b'"', 0x94 => b'"', 0x95 => b'*', 0x96 => b'-', 0x97 => b'-', 0x98 => b'~', 0x99 => b'T', 0x9A => b's', 0x9B => b'>', 0x9C => b'o', 0x9D => b' ', 0x9E => b'z', 0x9F => b'Y', _ => b'?', };
1025 result.push(replacement);
1026 }
1027 _ => {
1028 result.push(byte); }
1030 }
1031 }
1032
1033 result
1034 }
1035
1036 fn is_problematic_encoding_char(&self, ch: u8) -> bool {
1038 (0x80..=0x9F).contains(&ch) ||
1040 ch == 0x07 || (ch <= 0x1F && ch != 0x09 && ch != 0x0A && ch != 0x0D) || (self.options.lenient_syntax && ch >= 0xA0) }
1045
    /// Handles a problematic control/C1 byte found where a token was
    /// expected: skipped (with a warning) in lenient-encoding mode,
    /// otherwise reported as a character-encoding error.
    fn handle_encoding_char_in_token_stream(&mut self, ch: u8) -> ParseResult<Token> {
        if self.options.lenient_encoding {
            self.consume_char()?;

            if self.options.collect_warnings {
                // Categorize the byte for the warning text.
                let replacement_char = match ch {
                    0x07 => "bell",
                    0x00..=0x1F => "control",
                    0x80..=0x9F => "latin1-supplement",
                    _ => "unknown",
                };

                self.warnings.push(ParseWarning::InvalidEncoding {
                    position: self.position,
                    recovered_text: format!(
                        "Skipped problematic {replacement_char} character (0x{ch:02X})"
                    ),
                    encoding_used: None,
                    replacement_count: 1,
                });
            }

            // Resume lexing after the skipped byte; EOF here is an error
            // because a token was expected.
            self.skip_whitespace()?;
            if let Ok(Some(_)) = self.peek_char() {
                self.next_token()
            } else {
                Err(ParseError::SyntaxError {
                    position: self.position,
                    message: "Unexpected end of file after problematic character".to_string(),
                })
            }
        } else {
            let char_description = match ch {
                0x07 => "Bell character (\\u{07})".to_string(),
                0x00..=0x1F => format!("Control character (\\u{{{ch:02X}}})"),
                0x80..=0x9F => format!("Latin-1 supplement character (\\u{{{ch:02X}}})"),
                _ => format!("Problematic character (\\u{{{ch:02X}}})"),
            };

            Err(ParseError::CharacterEncodingError {
                position: self.position,
                message: format!(
                    "Unexpected character: {char_description} - Consider using lenient parsing mode"
                ),
            })
        }
    }
1098}
1099
1100#[cfg(test)]
1101mod tests {
1102 use super::*;
1103 use std::io::Cursor;
1104
    // Smoke test: one token of each basic kind, ending with Eof.
    #[test]
    fn test_lexer_basic_tokens() {
        let input = b"123 -456 3.14 true false null /Name";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(-456));
        assert_eq!(lexer.next_token().unwrap(), Token::Real(3.14));
        assert_eq!(lexer.next_token().unwrap(), Token::Boolean(true));
        assert_eq!(lexer.next_token().unwrap(), Token::Boolean(false));
        assert_eq!(lexer.next_token().unwrap(), Token::Null);
        assert_eq!(lexer.next_token().unwrap(), Token::Name("Name".to_string()));
        assert_eq!(lexer.next_token().unwrap(), Token::Eof);
    }

    // Negative signs on both integers and reals.
    #[test]
    fn test_lexer_negative_numbers() {
        let input = b"-123 -45.67";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(lexer.next_token().unwrap(), Token::Integer(-123));
        assert_eq!(lexer.next_token().unwrap(), Token::Real(-45.67));
    }

    // Literal and hex strings decode to the same raw-byte token kind.
    #[test]
    fn test_lexer_strings() {
        let input = b"(Hello World) <48656C6C6F>";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(
            lexer.next_token().unwrap(),
            Token::String(b"Hello World".to_vec())
        );
        assert_eq!(
            lexer.next_token().unwrap(),
            Token::String(b"Hello".to_vec())
        );
    }

    // << ... >> delimiters surround name tokens.
    #[test]
    fn test_lexer_dictionaries() {
        let input = b"<< /Type /Page >>";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
        assert_eq!(lexer.next_token().unwrap(), Token::Name("Type".to_string()));
        assert_eq!(lexer.next_token().unwrap(), Token::Name("Page".to_string()));
        assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
    }

    // [ ... ] delimiters surround integer tokens.
    #[test]
    fn test_lexer_arrays() {
        let input = b"[1 2 3]";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(2));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(3));
        assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
    }

    // The lexer emits the `R` operator as Name("R"); reference assembly
    // happens at a higher layer.
    #[test]
    fn test_lexer_references() {
        let input = b"1 0 R 25 1 R";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
        match lexer.next_token().unwrap() {
            Token::Name(s) if s == "R" => {}
            other => panic!("Expected R token, got {other:?}"),
        }

        assert_eq!(lexer.next_token().unwrap(), Token::Integer(25));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
        match lexer.next_token().unwrap() {
            Token::Name(s) if s == "R" => {}
            other => panic!("Expected R token, got {other:?}"),
        }
    }

    // Comments exclude the '%' and the line ending.
    #[test]
    fn test_lexer_comments() {
        let input = b"%PDF-1.7\n123";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(
            lexer.next_token().unwrap(),
            Token::Comment("PDF-1.7".to_string())
        );
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
    }
1202
1203 mod comprehensive_tests {
1205 use super::*;
1206 use std::io::Cursor;
1207
        // Derived Debug output contains the variant name and payload.
        #[test]
        fn test_token_debug_trait() {
            let token = Token::Integer(42);
            let debug_str = format!("{token:?}");
            assert!(debug_str.contains("Integer"));
            assert!(debug_str.contains("42"));
        }

        // Clone produces an equal token.
        #[test]
        fn test_token_clone() {
            let token = Token::String(b"hello".to_vec());
            let cloned = token.clone();
            assert_eq!(token, cloned);
        }

        // PartialEq distinguishes payloads and variants.
        #[test]
        fn test_token_equality() {
            assert_eq!(Token::Integer(42), Token::Integer(42));
            assert_ne!(Token::Integer(42), Token::Integer(43));
            assert_eq!(Token::Boolean(true), Token::Boolean(true));
            assert_ne!(Token::Boolean(true), Token::Boolean(false));
            assert_eq!(Token::Null, Token::Null);
            assert_ne!(Token::Null, Token::Integer(0));
        }

        // Empty input yields Eof immediately.
        #[test]
        fn test_lexer_empty_input() {
            let input = b"";
            let mut lexer = Lexer::new(Cursor::new(input));
            assert_eq!(lexer.next_token().unwrap(), Token::Eof);
        }

        // Whitespace-only input is skipped to Eof.
        #[test]
        fn test_lexer_whitespace_only() {
            let input = b" \t\n\r ";
            let mut lexer = Lexer::new(Cursor::new(input));
            assert_eq!(lexer.next_token().unwrap(), Token::Eof);
        }

        // Zero, explicit '+', negative zero, and a value beyond u32.
        #[test]
        fn test_lexer_integer_edge_cases() {
            let input = b"0 +123 -0 9876543210";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
            assert_eq!(lexer.next_token().unwrap(), Token::Integer(9876543210));
        }

        // Leading/trailing dots are accepted (.5, 5.).
        #[test]
        fn test_lexer_real_edge_cases() {
            let input = b"0.0 +3.14 -2.71828 .5 5. 123.456789";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::Real(0.0));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(3.14));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(-2.71828));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(0.5));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(5.0));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(123.456789));
        }

        // Exponent forms always produce Real tokens.
        #[test]
        fn test_lexer_scientific_notation() {
            let input = b"1.23e10 -4.56E-5 1e0 2E+3";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::Real(1.23e10));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(-4.56e-5));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(1e0));
            assert_eq!(lexer.next_token().unwrap(), Token::Real(2e3));
        }

        // Backslash escapes: \n, \t, unknown escape (kept verbatim), \\.
        #[test]
        fn test_lexer_string_literal_escapes() {
            let input = b"(Hello\\nWorld) (Tab\\tChar) (Quote\\\"Mark) (Backslash\\\\)";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Hello\nWorld".to_vec())
            );
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Tab\tChar".to_vec())
            );
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Quote\"Mark".to_vec())
            );
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Backslash\\".to_vec())
            );
        }

        // Balanced unescaped parens are kept inside the string.
        #[test]
        fn test_lexer_string_literal_nested_parens() {
            let input = b"(Nested (parentheses) work)";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Nested (parentheses) work".to_vec())
            );
        }

        // Empty literal string.
        #[test]
        fn test_lexer_string_literal_empty() {
            let input = b"()";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::String(b"".to_vec()));
        }

        // Hex strings, including the empty <>.
        #[test]
        fn test_lexer_hexadecimal_strings() {
            let input = b"<48656C6C6F> <20576F726C64> <>";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Hello".to_vec())
            );
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b" World".to_vec())
            );
            assert_eq!(lexer.next_token().unwrap(), Token::String(b"".to_vec()));
        }

        // Odd digit counts are padded with a trailing '0'.
        #[test]
        fn test_lexer_hexadecimal_strings_odd_length() {
            let input = b"<48656C6C6F2> <1> <ABC>";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Hello ".to_vec())
            );
            assert_eq!(lexer.next_token().unwrap(), Token::String(b"\x10".to_vec()));
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"\xAB\xC0".to_vec())
            );
        }

        // Whitespace between hex digit pairs is ignored.
        #[test]
        fn test_lexer_hexadecimal_strings_whitespace() {
            let input = b"<48 65 6C 6C 6F>";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(
                lexer.next_token().unwrap(),
                Token::String(b"Hello".to_vec())
            );
        }

        // Plain names lose their leading slash.
        #[test]
        fn test_lexer_names() {
            let input = b"/Type /Page /Root /Kids /Count /MediaBox";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::Name("Type".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("Page".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("Root".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("Kids".to_string()));
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::Name("Count".to_string())
            );
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::Name("MediaBox".to_string())
            );
        }

        // #xx hex escapes decode inside names.
        #[test]
        fn test_lexer_names_with_special_chars() {
            let input = b"/Name#20with#20spaces /Name#2Fwith#2Fslashes";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(
                lexer.next_token().unwrap(),
                Token::Name("Name with spaces".to_string())
            );
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::Name("Name/with/slashes".to_string())
            );
        }

        // Empty name, single char, digits, and keyword-lookalike names.
        #[test]
        fn test_lexer_names_edge_cases() {
            let input = b"/ /A /123 /true /false /null";
            let mut lexer = Lexer::new(Cursor::new(input));

            assert_eq!(lexer.next_token().unwrap(), Token::Name("".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("A".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("123".to_string()));
            assert_eq!(lexer.next_token().unwrap(), Token::Name("true".to_string()));
            assert_eq!(
                lexer.next_token().unwrap(),
                Token::Name("false".to_string())
            );
            assert_eq!(lexer.next_token().unwrap(), Token::Name("null".to_string()));
        }
1417
1418 #[test]
1419 fn test_lexer_nested_dictionaries() {
1420 let input = b"<< /Type /Page /Resources << /Font << /F1 123 0 R >> >> >>";
1421 let mut lexer = Lexer::new(Cursor::new(input));
1422
1423 assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
1424 assert_eq!(lexer.next_token().unwrap(), Token::Name("Type".to_string()));
1425 assert_eq!(lexer.next_token().unwrap(), Token::Name("Page".to_string()));
1426 assert_eq!(
1427 lexer.next_token().unwrap(),
1428 Token::Name("Resources".to_string())
1429 );
1430 assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
1431 assert_eq!(lexer.next_token().unwrap(), Token::Name("Font".to_string()));
1432 assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
1433 assert_eq!(lexer.next_token().unwrap(), Token::Name("F1".to_string()));
1434 assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
1435 assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
1436 assert_eq!(lexer.next_token().unwrap(), Token::Name("R".to_string()));
1437 assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
1438 assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
1439 assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
1440 }
1441
1442 #[test]
1443 fn test_lexer_nested_arrays() {
1444 let input = b"[[1 2] [3 4] [5 [6 7]]]";
1445 let mut lexer = Lexer::new(Cursor::new(input));
1446
1447 assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
1448 assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
1449 assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
1450 assert_eq!(lexer.next_token().unwrap(), Token::Integer(2));
1451 assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
1452 assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
1453 assert_eq!(lexer.next_token().unwrap(), Token::Integer(3));
1454 assert_eq!(lexer.next_token().unwrap(), Token::Integer(4));
1455 assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
1456 assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
1457 assert_eq!(lexer.next_token().unwrap(), Token::Integer(5));
1458 assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
1459 assert_eq!(lexer.next_token().unwrap(), Token::Integer(6));
1460 assert_eq!(lexer.next_token().unwrap(), Token::Integer(7));
1461 assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
1462 assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
1463 assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
1464 }
1465
1466 #[test]
1467 fn test_lexer_mixed_content() {
1468 let input = b"<< /Type /Page /MediaBox [0 0 612 792] /Resources << /Font << /F1 << /Type /Font /Subtype /Type1 >> >> >> >>";
1469 let mut lexer = Lexer::new(Cursor::new(input));
1470
1471 let mut tokens = Vec::new();
1473 loop {
1474 match lexer.next_token().unwrap() {
1475 Token::Eof => break,
1476 token => tokens.push(token),
1477 }
1478 }
1479 assert!(tokens.len() > 10);
1480 }
1481
1482 #[test]
1483 fn test_lexer_keywords() {
1484 let input = b"obj endobj stream endstream startxref";
1485 let mut lexer = Lexer::new(Cursor::new(input));
1486
1487 assert_eq!(lexer.next_token().unwrap(), Token::Obj);
1488 assert_eq!(lexer.next_token().unwrap(), Token::EndObj);
1489 assert_eq!(lexer.next_token().unwrap(), Token::Stream);
1490 assert_eq!(lexer.next_token().unwrap(), Token::EndStream);
1491 assert_eq!(lexer.next_token().unwrap(), Token::StartXRef);
1492 }
1493
1494 #[test]
1495 fn test_lexer_multiple_comments() {
1496 let input = b"%First comment\n%Second comment\n123";
1497 let mut lexer = Lexer::new(Cursor::new(input));
1498
1499 assert_eq!(
1500 lexer.next_token().unwrap(),
1501 Token::Comment("First comment".to_string())
1502 );
1503 assert_eq!(
1504 lexer.next_token().unwrap(),
1505 Token::Comment("Second comment".to_string())
1506 );
1507 assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
1508 }
1509
1510 #[test]
1511 fn test_lexer_comment_without_newline() {
1512 let input = b"%Comment at end";
1513 let mut lexer = Lexer::new(Cursor::new(input));
1514
1515 assert_eq!(
1516 lexer.next_token().unwrap(),
1517 Token::Comment("Comment at end".to_string())
1518 );
1519 assert_eq!(lexer.next_token().unwrap(), Token::Eof);
1520 }
1521
1522 #[test]
1523 fn test_lexer_special_characters_in_streams() {
1524 let input = b"<< /Length 5 >> stream\nHello endstream";
1525 let mut lexer = Lexer::new(Cursor::new(input));
1526
1527 assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
1528 assert_eq!(
1529 lexer.next_token().unwrap(),
1530 Token::Name("Length".to_string())
1531 );
1532 assert_eq!(lexer.next_token().unwrap(), Token::Integer(5));
1533 assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
1534 assert_eq!(lexer.next_token().unwrap(), Token::Stream);
1535 }
1537
1538 #[test]
1539 fn test_lexer_push_token() {
1540 let input = b"123 456";
1541 let mut lexer = Lexer::new(Cursor::new(input));
1542
1543 let token1 = lexer.next_token().unwrap();
1544 assert_eq!(token1, Token::Integer(123));
1545
1546 let token2 = lexer.next_token().unwrap();
1547 assert_eq!(token2, Token::Integer(456));
1548
1549 lexer.push_token(token2.clone());
1551
1552 let token3 = lexer.next_token().unwrap();
1554 assert_eq!(token3, token2);
1555
1556 let token4 = lexer.next_token().unwrap();
1558 assert_eq!(token4, Token::Eof);
1559 }
1560
1561 #[test]
1562 fn test_lexer_push_multiple_tokens() {
1563 let input = b"123";
1564 let mut lexer = Lexer::new(Cursor::new(input));
1565
1566 let original_token = lexer.next_token().unwrap();
1567 assert_eq!(original_token, Token::Integer(123));
1568
1569 lexer.push_token(Token::Boolean(true));
1571 lexer.push_token(Token::Boolean(false));
1572 lexer.push_token(Token::Null);
1573
1574 assert_eq!(lexer.next_token().unwrap(), Token::Null);
1576 assert_eq!(lexer.next_token().unwrap(), Token::Boolean(false));
1577 assert_eq!(lexer.next_token().unwrap(), Token::Boolean(true));
1578 assert_eq!(lexer.next_token().unwrap(), Token::Eof);
1579 }
1580
1581 #[test]
1582 fn test_lexer_read_newline() {
1583 let input = b"123\n456\r\n789";
1584 let mut lexer = Lexer::new(Cursor::new(input));
1585
1586 let digits1 = lexer.read_digits().unwrap();
1588 assert_eq!(digits1, "123");
1589 assert!(lexer.read_newline().is_ok());
1590
1591 let digits2 = lexer.read_digits().unwrap();
1593 assert_eq!(digits2, "456");
1594 assert!(lexer.read_newline().is_ok());
1595
1596 let digits3 = lexer.read_digits().unwrap();
1598 assert_eq!(digits3, "789");
1599 }
1600
1601 #[test]
1602 fn test_lexer_read_bytes() {
1603 let input = b"Hello World";
1604 let mut lexer = Lexer::new(Cursor::new(input));
1605
1606 let bytes = lexer.read_bytes(5).unwrap();
1607 assert_eq!(bytes, b"Hello");
1608
1609 let bytes = lexer.read_bytes(6).unwrap();
1610 assert_eq!(bytes, b" World");
1611 }
1612
1613 #[test]
1614 fn test_lexer_read_until_sequence() {
1615 let input = b"Hello endstream World";
1616 let mut lexer = Lexer::new(Cursor::new(input));
1617
1618 let result = lexer.read_until_sequence(b"endstream").unwrap();
1619 assert_eq!(result, b"Hello ");
1620
1621 let rest = lexer.read_digits().unwrap();
1623 assert_eq!(rest, ""); }
1625
1626 #[test]
1627 fn test_lexer_read_until_sequence_not_found() {
1628 let input = b"Hello World";
1629 let mut lexer = Lexer::new(Cursor::new(input));
1630
1631 let result = lexer.read_until_sequence(b"notfound");
1632 assert!(result.is_err());
1633 }
1634
1635 #[test]
1636 fn test_lexer_position_tracking() {
1637 let input = b"123 456";
1638 let mut lexer = Lexer::new(Cursor::new(input));
1639
1640 let initial_pos = lexer.position();
1641 assert_eq!(initial_pos, 0);
1642
1643 lexer.next_token().unwrap(); let pos_after_first = lexer.position();
1645 assert!(pos_after_first > initial_pos);
1646
1647 lexer.next_token().unwrap(); let pos_after_second = lexer.position();
1649 assert!(pos_after_second > pos_after_first);
1650 }
1651
1652 #[test]
1653 fn test_lexer_large_numbers() {
1654 let input = b"2147483647 -2147483648 9223372036854775807 -9223372036854775808";
1655 let mut lexer = Lexer::new(Cursor::new(input));
1656
1657 assert_eq!(lexer.next_token().unwrap(), Token::Integer(2147483647));
1658 assert_eq!(lexer.next_token().unwrap(), Token::Integer(-2147483648));
1659 assert_eq!(
1660 lexer.next_token().unwrap(),
1661 Token::Integer(9223372036854775807)
1662 );
1663 assert_eq!(
1664 lexer.next_token().unwrap(),
1665 Token::Integer(-9223372036854775808)
1666 );
1667 }
1668
1669 #[test]
1670 fn test_lexer_very_long_string() {
1671 let long_str = "A".repeat(1000);
1672 let input = format!("({long_str})");
1673 let mut lexer = Lexer::new(Cursor::new(input.as_bytes()));
1674
1675 if let Token::String(s) = lexer.next_token().unwrap() {
1676 assert_eq!(s.len(), 1000);
1677 assert_eq!(s, long_str.as_bytes());
1678 } else {
1679 panic!("Expected string token");
1680 }
1681 }
1682
1683 #[test]
1684 fn test_lexer_very_long_name() {
1685 let long_name = "A".repeat(500);
1686 let input = format!("/{long_name}");
1687 let mut lexer = Lexer::new(Cursor::new(input.as_bytes()));
1688
1689 if let Token::Name(name) = lexer.next_token().unwrap() {
1690 assert_eq!(name.len(), 500);
1691 assert_eq!(name, long_name);
1692 } else {
1693 panic!("Expected name token");
1694 }
1695 }
1696
1697 #[test]
1698 fn test_lexer_error_handling_invalid_hex() {
1699 let input = b"<48656C6C6FG>";
1700 let mut lexer = Lexer::new(Cursor::new(input));
1701
1702 let result = lexer.next_token();
1704 assert!(result.is_ok() || result.is_err()); }
1706
1707 #[test]
1708 fn test_lexer_all_token_types() {
1709 let input = b"true false null 123 -456 3.14 (string) <48656C6C6F> /Name [ ] << >> obj endobj stream endstream startxref % comment\n";
1710 let mut lexer = Lexer::new(Cursor::new(input));
1711
1712 let mut token_types = Vec::new();
1713 loop {
1714 match lexer.next_token().unwrap() {
1715 Token::Eof => break,
1716 token => token_types.push(std::mem::discriminant(&token)),
1717 }
1718 }
1719
1720 assert!(token_types.len() > 10);
1722 }
1723
1724 #[test]
1725 fn test_lexer_performance() {
1726 let input = "123 456 789 ".repeat(1000);
1727 let mut lexer = Lexer::new(Cursor::new(input.as_bytes()));
1728
1729 let start_time = std::time::Instant::now();
1730 let mut count = 0;
1731 loop {
1732 match lexer.next_token().unwrap() {
1733 Token::Eof => break,
1734 _ => count += 1,
1735 }
1736 }
1737 let elapsed = start_time.elapsed();
1738
1739 assert_eq!(count, 3000); assert!(elapsed.as_millis() < 1000); }
1742 }
1743
1744 #[test]
1745 fn test_lexer_find_keyword_ahead() {
1746 let input = b"some data here endstream more data";
1747 let mut lexer = Lexer::new(Cursor::new(input));
1748
1749 let result = lexer.find_keyword_ahead("endstream", 100);
1751 assert!(result.is_ok());
1752 assert_eq!(result.unwrap(), Some(15)); let result2 = lexer.find_keyword_ahead("notfound", 100);
1756 assert!(result2.is_ok());
1757 assert_eq!(result2.unwrap(), None);
1758
1759 let result3 = lexer.find_keyword_ahead("endstream", 10);
1761 assert!(result3.is_ok());
1762 assert_eq!(result3.unwrap(), None); }
1764
1765 #[test]
1766 fn test_lexer_peek_token() {
1767 let input = b"123 456 /Name";
1768 let mut lexer = Lexer::new(Cursor::new(input));
1769
1770 let peeked = lexer.peek_token();
1772 assert!(peeked.is_ok());
1773 assert_eq!(peeked.unwrap(), Token::Integer(123));
1774
1775 let next = lexer.next_token();
1777 assert!(next.is_ok());
1778 assert_eq!(next.unwrap(), Token::Integer(123));
1779
1780 assert_eq!(lexer.peek_token().unwrap(), Token::Integer(456));
1782 assert_eq!(lexer.next_token().unwrap(), Token::Integer(456));
1783
1784 assert_eq!(lexer.peek_token().unwrap(), Token::Name("Name".to_string()));
1785 assert_eq!(lexer.next_token().unwrap(), Token::Name("Name".to_string()));
1786 }
1787
1788 #[test]
1789 fn test_lexer_expect_keyword() {
1790 let input = b"endstream obj endobj";
1791 let mut lexer = Lexer::new(Cursor::new(input));
1792
1793 assert!(lexer.expect_keyword("endstream").is_ok());
1795
1796 assert!(lexer.expect_keyword("obj").is_ok());
1798
1799 let result = lexer.expect_keyword("stream");
1801 assert!(result.is_err());
1802 match result {
1803 Err(ParseError::UnexpectedToken { expected, found }) => {
1804 assert!(expected.contains("stream"));
1805 assert!(found.contains("EndObj"));
1806 }
1807 _ => panic!("Expected UnexpectedToken error"),
1808 }
1809 }
1810
1811 #[test]
1812 fn test_lexer_save_restore_position() {
1813 let input = b"123 456 789";
1814 let mut lexer = Lexer::new(Cursor::new(input));
1815
1816 assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
1818
1819 let saved = lexer.save_position();
1821 assert!(saved.is_ok());
1822 let saved_pos = saved.unwrap();
1823
1824 assert_eq!(lexer.next_token().unwrap(), Token::Integer(456));
1826 assert_eq!(lexer.next_token().unwrap(), Token::Integer(789));
1827
1828 assert!(lexer.restore_position(saved_pos).is_ok());
1830
1831 assert_eq!(lexer.next_token().unwrap(), Token::Integer(456));
1833 }
1834
1835 #[test]
1836 fn test_lexer_character_encoding_recovery() {
1837 let input = b"(Caf\x80 \x91Hello\x92)"; let options = ParseOptions::lenient();
1840 let mut lexer = Lexer::new_with_options(Cursor::new(input), options);
1841
1842 match lexer.next_token().unwrap() {
1843 Token::String(bytes) => {
1844 let text = String::from_utf8_lossy(&bytes);
1846 println!("Recovered text: {text}");
1847 assert!(!text.is_empty()); }
1849 other => panic!("Expected String token, got {other:?}"),
1850 }
1851
1852 let warnings = lexer.warnings();
1854 if !warnings.is_empty() {
1855 println!("Encoding warnings: {warnings:?}");
1856 }
1857 }
1858}