1use super::{ParseError, ParseResult};
6use std::io::Read;
7
/// A lexical token produced by [`Lexer`].
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// The keyword `true` or `false`.
    Boolean(bool),

    /// A number written without a decimal point.
    Integer(i64),

    /// A number written with a decimal point.
    Real(f64),

    /// Raw bytes of a literal `( … )` or hexadecimal `< … >` string.
    String(Vec<u8>),

    /// A name, stored without its leading `/`.
    Name(String),

    /// `[`
    ArrayStart,

    /// `]`
    ArrayEnd,

    /// `<<`
    DictStart,

    /// `>>`
    DictEnd,

    /// The keyword `stream`.
    Stream,

    /// The keyword `endstream`.
    EndStream,

    /// The keyword `obj`.
    Obj,

    /// The keyword `endobj`.
    EndObj,

    /// The keyword `startxref`.
    StartXRef,

    /// An indirect reference; presumably (object number, generation number).
    /// NOTE(review): the lexer code in this file never constructs this variant.
    Reference(u32, u16),

    /// The keyword `null`.
    Null,

    /// Text of a `%` comment, without the `%` and without the line ending.
    Comment(String),

    /// End of input.
    Eof,
}
65
66pub struct Lexer<R: Read> {
68 reader: std::io::BufReader<R>,
69 #[allow(dead_code)]
70 buffer: Vec<u8>,
71 position: usize,
72 peek_buffer: Option<u8>,
73 token_buffer: Vec<Token>,
74}
75
76impl<R: Read> Lexer<R> {
77 pub fn new(reader: R) -> Self {
79 Self {
80 reader: std::io::BufReader::new(reader),
81 buffer: Vec::with_capacity(1024),
82 position: 0,
83 peek_buffer: None,
84 token_buffer: Vec::new(),
85 }
86 }
87
88 pub fn next_token(&mut self) -> ParseResult<Token> {
90 if let Some(token) = self.token_buffer.pop() {
92 return Ok(token);
93 }
94
95 self.skip_whitespace()?;
96
97 let ch = match self.peek_char()? {
98 Some(ch) => ch,
99 None => return Ok(Token::Eof),
100 };
101
102 match ch {
103 b'%' => self.read_comment(),
104 b'/' => self.read_name(),
105 b'(' => self.read_literal_string(),
106 b'<' => self.read_angle_bracket(),
107 b'>' => {
108 self.consume_char()?;
109 if self.peek_char()? == Some(b'>') {
110 self.consume_char()?;
111 Ok(Token::DictEnd)
112 } else {
113 Err(ParseError::SyntaxError {
114 position: self.position,
115 message: "Expected '>' after '>'".to_string(),
116 })
117 }
118 }
119 b'[' => {
120 self.consume_char()?;
121 Ok(Token::ArrayStart)
122 }
123 b']' => {
124 self.consume_char()?;
125 Ok(Token::ArrayEnd)
126 }
127 b't' | b'f' => self.read_boolean(),
128 b'n' => self.read_null(),
129 b'+' | b'-' | b'0'..=b'9' | b'.' => self.read_number(),
130 b'R' => {
131 self.consume_char()?;
133 Ok(Token::Name("R".to_string()))
134 }
135 _ if ch.is_ascii_alphabetic() => self.read_keyword(),
136 _ => Err(ParseError::SyntaxError {
137 position: self.position,
138 message: format!("Unexpected character: {}", ch as char),
139 }),
140 }
141 }
142
143 fn peek_char(&mut self) -> ParseResult<Option<u8>> {
145 if let Some(ch) = self.peek_buffer {
146 return Ok(Some(ch));
147 }
148
149 let mut buf = [0u8; 1];
150 match self.reader.read_exact(&mut buf) {
151 Ok(_) => {
152 self.peek_buffer = Some(buf[0]);
153 Ok(Some(buf[0]))
154 }
155 Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => Ok(None),
156 Err(e) => Err(e.into()),
157 }
158 }
159
160 fn consume_char(&mut self) -> ParseResult<Option<u8>> {
162 let ch = self.peek_char()?;
163 if ch.is_some() {
164 self.peek_buffer = None;
165 self.position += 1;
166 }
167 Ok(ch)
168 }
169
170 pub(crate) fn skip_whitespace(&mut self) -> ParseResult<usize> {
172 let mut count = 0;
173 while let Some(ch) = self.peek_char()? {
174 if ch.is_ascii_whitespace() {
175 self.consume_char()?;
176 count += 1;
177 } else {
178 break;
179 }
180 }
181 Ok(count)
182 }
183
184 fn read_comment(&mut self) -> ParseResult<Token> {
186 self.consume_char()?; let mut comment = String::new();
188
189 while let Some(ch) = self.peek_char()? {
190 if ch == b'\n' || ch == b'\r' {
191 break;
192 }
193 self.consume_char()?;
194 comment.push(ch as char);
195 }
196
197 Ok(Token::Comment(comment))
198 }
199
200 fn read_name(&mut self) -> ParseResult<Token> {
202 self.consume_char()?; let mut name = String::new();
204
205 while let Some(ch) = self.peek_char()? {
206 if ch.is_ascii_whitespace()
207 || matches!(ch, b'/' | b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'%')
208 {
209 break;
210 }
211 self.consume_char()?;
212
213 if ch == b'#' {
215 let hex1 = self
216 .consume_char()?
217 .ok_or_else(|| ParseError::SyntaxError {
218 position: self.position,
219 message: "Incomplete hex code in name".to_string(),
220 })?;
221 let hex2 = self
222 .consume_char()?
223 .ok_or_else(|| ParseError::SyntaxError {
224 position: self.position,
225 message: "Incomplete hex code in name".to_string(),
226 })?;
227
228 let value = u8::from_str_radix(&format!("{}{}", hex1 as char, hex2 as char), 16)
229 .map_err(|_| ParseError::SyntaxError {
230 position: self.position,
231 message: "Invalid hex code in name".to_string(),
232 })?;
233
234 name.push(value as char);
235 } else {
236 name.push(ch as char);
237 }
238 }
239
240 Ok(Token::Name(name))
241 }
242
243 fn read_literal_string(&mut self) -> ParseResult<Token> {
245 self.consume_char()?; let mut string = Vec::new();
247 let mut paren_depth = 1;
248 let mut escape = false;
249
250 while paren_depth > 0 {
251 let ch = self
252 .consume_char()?
253 .ok_or_else(|| ParseError::SyntaxError {
254 position: self.position,
255 message: "Unterminated string".to_string(),
256 })?;
257
258 if escape {
259 let escaped = match ch {
260 b'n' => b'\n',
261 b'r' => b'\r',
262 b't' => b'\t',
263 b'b' => b'\x08',
264 b'f' => b'\x0C',
265 b'(' => b'(',
266 b')' => b')',
267 b'\\' => b'\\',
268 b'0'..=b'7' => {
269 let mut value = ch - b'0';
271 for _ in 0..2 {
272 if let Some(next) = self.peek_char()? {
273 if matches!(next, b'0'..=b'7') {
274 self.consume_char()?;
275 value = value * 8 + (next - b'0');
276 } else {
277 break;
278 }
279 }
280 }
281 value
282 }
283 _ => ch, };
285 string.push(escaped);
286 escape = false;
287 } else {
288 match ch {
289 b'\\' => escape = true,
290 b'(' => {
291 string.push(ch);
292 paren_depth += 1;
293 }
294 b')' => {
295 paren_depth -= 1;
296 if paren_depth > 0 {
297 string.push(ch);
298 }
299 }
300 _ => string.push(ch),
301 }
302 }
303 }
304
305 Ok(Token::String(string))
306 }
307
308 fn read_angle_bracket(&mut self) -> ParseResult<Token> {
310 self.consume_char()?; if self.peek_char()? == Some(b'<') {
313 self.consume_char()?;
314 Ok(Token::DictStart)
315 } else {
316 let mut hex_chars = String::new();
318 let mut found_end = false;
319
320 while let Some(ch) = self.peek_char()? {
321 if ch == b'>' {
322 self.consume_char()?;
323 found_end = true;
324 break;
325 }
326 self.consume_char()?;
327 if ch.is_ascii_hexdigit() {
328 hex_chars.push(ch as char);
329 } else if !ch.is_ascii_whitespace() {
330 return Err(ParseError::SyntaxError {
331 position: self.position,
332 message: "Invalid character in hex string".to_string(),
333 });
334 }
335 }
336
337 if !found_end {
338 return Err(ParseError::SyntaxError {
339 position: self.position,
340 message: "Unterminated hex string".to_string(),
341 });
342 }
343
344 if hex_chars.len() % 2 != 0 {
346 hex_chars.push('0');
347 }
348
349 let mut bytes = Vec::new();
351 for chunk in hex_chars.as_bytes().chunks(2) {
352 let hex_str = std::str::from_utf8(chunk).unwrap();
353 let byte =
354 u8::from_str_radix(hex_str, 16).map_err(|_| ParseError::SyntaxError {
355 position: self.position,
356 message: "Invalid hex string".to_string(),
357 })?;
358 bytes.push(byte);
359 }
360
361 Ok(Token::String(bytes))
362 }
363 }
364
365 fn read_boolean(&mut self) -> ParseResult<Token> {
367 let word = self.read_word()?;
368 match word.as_str() {
369 "true" => Ok(Token::Boolean(true)),
370 "false" => Ok(Token::Boolean(false)),
371 _ => {
372 self.process_keyword(word)
374 }
375 }
376 }
377
378 fn read_null(&mut self) -> ParseResult<Token> {
380 let word = self.read_word()?;
381 if word == "null" {
382 Ok(Token::Null)
383 } else {
384 self.process_keyword(word)
386 }
387 }
388
389 fn read_number(&mut self) -> ParseResult<Token> {
391 let mut number_str = String::new();
392 let mut has_dot = false;
393
394 if let Some(ch) = self.peek_char()? {
396 if ch == b'+' || ch == b'-' {
397 self.consume_char()?;
398 number_str.push(ch as char);
399
400 if let Some(next) = self.peek_char()? {
402 if !next.is_ascii_digit() && next != b'.' {
403 return Err(ParseError::SyntaxError {
404 position: self.position,
405 message: "Expected digit after sign".to_string(),
406 });
407 }
408 }
409 }
410 }
411
412 while let Some(ch) = self.peek_char()? {
414 match ch {
415 b'0'..=b'9' => {
416 self.consume_char()?;
417 number_str.push(ch as char);
418 }
419 b'.' if !has_dot => {
420 self.consume_char()?;
421 number_str.push(ch as char);
422 has_dot = true;
423 }
424 _ => break,
425 }
426 }
427
428 if has_dot {
433 let value = number_str
434 .parse::<f64>()
435 .map_err(|_| ParseError::SyntaxError {
436 position: self.position,
437 message: format!("Invalid real number: '{number_str}'"),
438 })?;
439 Ok(Token::Real(value))
440 } else {
441 let value = number_str
442 .parse::<i64>()
443 .map_err(|_| ParseError::SyntaxError {
444 position: self.position,
445 message: format!("Invalid integer: '{number_str}'"),
446 })?;
447 Ok(Token::Integer(value))
448 }
449 }
450
451 fn read_keyword(&mut self) -> ParseResult<Token> {
453 let word = self.read_word()?;
454 self.process_keyword(word)
455 }
456
457 fn process_keyword(&self, word: String) -> ParseResult<Token> {
459 match word.as_str() {
460 "stream" => Ok(Token::Stream),
461 "endstream" => Ok(Token::EndStream),
462 "obj" => Ok(Token::Obj),
463 "endobj" => Ok(Token::EndObj),
464 "startxref" => Ok(Token::StartXRef),
465 _ => Err(ParseError::SyntaxError {
466 position: self.position,
467 message: format!("Unknown keyword: {word}"),
468 }),
469 }
470 }
471
472 fn read_word(&mut self) -> ParseResult<String> {
474 let mut word = String::new();
475
476 while let Some(ch) = self.peek_char()? {
477 if ch.is_ascii_whitespace()
478 || matches!(ch, b'/' | b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'%')
479 {
480 break;
481 }
482 self.consume_char()?;
483 word.push(ch as char);
484 }
485
486 Ok(word)
487 }
488
489 #[allow(dead_code)]
491 fn read_digits(&mut self) -> ParseResult<String> {
492 let mut digits = String::new();
493
494 while let Some(ch) = self.peek_char()? {
495 if ch.is_ascii_digit() {
496 self.consume_char()?;
497 digits.push(ch as char);
498 } else {
499 break;
500 }
501 }
502
503 Ok(digits)
504 }
505
506 pub fn read_newline(&mut self) -> ParseResult<()> {
508 match self.peek_char()? {
509 Some(b'\r') => {
510 self.consume_char()?;
511 if self.peek_char()? == Some(b'\n') {
513 self.consume_char()?;
514 }
515 Ok(())
516 }
517 Some(b'\n') => {
518 self.consume_char()?;
519 Ok(())
520 }
521 _ => Err(ParseError::SyntaxError {
522 position: self.position,
523 message: "Expected newline".to_string(),
524 }),
525 }
526 }
527
528 pub fn read_bytes(&mut self, n: usize) -> ParseResult<Vec<u8>> {
530 let mut bytes = vec![0u8; n];
531 self.reader.read_exact(&mut bytes)?;
532 self.position += n;
533 Ok(bytes)
534 }
535
536 pub fn read_until_sequence(&mut self, sequence: &[u8]) -> ParseResult<Vec<u8>> {
538 let mut result = Vec::new();
539 let mut match_pos = 0;
540
541 while let Some(ch) = self.consume_char()? {
542 result.push(ch);
543
544 if ch == sequence[match_pos] {
545 match_pos += 1;
546 if match_pos == sequence.len() {
547 result.truncate(result.len() - sequence.len());
549 break;
550 }
551 } else if ch == sequence[0] {
552 match_pos = 1;
553 } else {
554 match_pos = 0;
555 }
556 }
557
558 if match_pos < sequence.len() {
559 return Err(ParseError::SyntaxError {
560 position: self.position,
561 message: format!("Sequence {sequence:?} not found"),
562 });
563 }
564
565 Ok(result)
566 }
567
568 pub fn position(&self) -> usize {
570 self.position
571 }
572
573 pub fn push_token(&mut self, token: Token) {
575 self.token_buffer.push(token);
576 }
577}
578
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Builds a lexer over an in-memory byte slice.
    fn lexer_over(input: &[u8]) -> Lexer<Cursor<&[u8]>> {
        Lexer::new(Cursor::new(input))
    }

    /// Asserts that the lexer yields exactly `expected`, in order.
    fn assert_tokens<R: std::io::Read>(lexer: &mut Lexer<R>, expected: &[Token]) {
        for want in expected {
            assert_eq!(&lexer.next_token().unwrap(), want);
        }
    }

    /// Asserts that the next token is the `R` marker, emitted as a name.
    fn assert_r_marker<R: std::io::Read>(lexer: &mut Lexer<R>) {
        match lexer.next_token().unwrap() {
            Token::Name(s) if s == "R" => {}
            other => panic!("Expected R token, got {other:?}"),
        }
    }

    #[test]
    fn test_lexer_basic_tokens() {
        let mut lexer = lexer_over(b"123 -456 3.14 true false null /Name");

        assert_tokens(
            &mut lexer,
            &[
                Token::Integer(123),
                Token::Integer(-456),
                Token::Real(3.14),
                Token::Boolean(true),
                Token::Boolean(false),
                Token::Null,
                Token::Name("Name".to_string()),
                Token::Eof,
            ],
        );
    }

    #[test]
    fn test_lexer_negative_numbers() {
        let mut lexer = lexer_over(b"-123 -45.67");

        assert_tokens(&mut lexer, &[Token::Integer(-123), Token::Real(-45.67)]);
    }

    #[test]
    fn test_lexer_strings() {
        let mut lexer = lexer_over(b"(Hello World) <48656C6C6F>");

        assert_tokens(
            &mut lexer,
            &[
                Token::String(b"Hello World".to_vec()),
                Token::String(b"Hello".to_vec()),
            ],
        );
    }

    #[test]
    fn test_lexer_dictionaries() {
        let mut lexer = lexer_over(b"<< /Type /Page >>");

        assert_tokens(
            &mut lexer,
            &[
                Token::DictStart,
                Token::Name("Type".to_string()),
                Token::Name("Page".to_string()),
                Token::DictEnd,
            ],
        );
    }

    #[test]
    fn test_lexer_arrays() {
        let mut lexer = lexer_over(b"[1 2 3]");

        assert_tokens(
            &mut lexer,
            &[
                Token::ArrayStart,
                Token::Integer(1),
                Token::Integer(2),
                Token::Integer(3),
                Token::ArrayEnd,
            ],
        );
    }

    #[test]
    fn test_lexer_references() {
        let mut lexer = lexer_over(b"1 0 R 25 1 R");

        assert_tokens(&mut lexer, &[Token::Integer(1), Token::Integer(0)]);
        assert_r_marker(&mut lexer);

        assert_tokens(&mut lexer, &[Token::Integer(25), Token::Integer(1)]);
        assert_r_marker(&mut lexer);
    }

    #[test]
    fn test_lexer_comments() {
        let mut lexer = lexer_over(b"%PDF-1.7\n123");

        assert_tokens(
            &mut lexer,
            &[Token::Comment("PDF-1.7".to_string()), Token::Integer(123)],
        );
    }
}