1use super::{ParseError, ParseResult};
6use std::io::Read;
7
/// A lexical token produced by [`Lexer`].
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// The keyword `true` or `false`.
    Boolean(bool),
    /// A number written without a decimal point.
    Integer(i64),
    /// A number written with a decimal point.
    Real(f64),
    /// The decoded bytes of a literal `( … )` or hexadecimal `< … >` string.
    String(Vec<u8>),
    /// A name, without its leading `/` and with `#xx` escapes decoded.
    Name(String),
    /// `[`
    ArrayStart,
    /// `]`
    ArrayEnd,
    /// `<<`
    DictStart,
    /// `>>`
    DictEnd,
    /// The keyword `stream`.
    Stream,
    /// The keyword `endstream`.
    EndStream,
    /// The keyword `obj`.
    Obj,
    /// The keyword `endobj`.
    EndObj,
    /// The keyword `startxref`.
    StartXRef,
    /// An indirect reference — presumably (object number, generation number).
    ///
    /// The lexer methods in this file never produce this variant; a bare `R`
    /// is emitted as `Name("R")` instead.
    Reference(u32, u16),
    /// The keyword `null`.
    Null,
    /// The text of a `%` comment, excluding the `%` and the line ending.
    Comment(String),
    /// End of input.
    Eof,
}
65
66pub struct Lexer<R: Read> {
68 reader: std::io::BufReader<R>,
69 buffer: Vec<u8>,
70 position: usize,
71 peek_buffer: Option<u8>,
72 token_buffer: Vec<Token>,
73}
74
75impl<R: Read> Lexer<R> {
76 pub fn new(reader: R) -> Self {
78 Self {
79 reader: std::io::BufReader::new(reader),
80 buffer: Vec::with_capacity(1024),
81 position: 0,
82 peek_buffer: None,
83 token_buffer: Vec::new(),
84 }
85 }
86
87 pub fn next_token(&mut self) -> ParseResult<Token> {
89 if let Some(token) = self.token_buffer.pop() {
91 return Ok(token);
92 }
93
94 self.skip_whitespace()?;
95
96 let ch = match self.peek_char()? {
97 Some(ch) => ch,
98 None => return Ok(Token::Eof),
99 };
100
101 match ch {
102 b'%' => self.read_comment(),
103 b'/' => self.read_name(),
104 b'(' => self.read_literal_string(),
105 b'<' => self.read_angle_bracket(),
106 b'>' => {
107 self.consume_char()?;
108 if self.peek_char()? == Some(b'>') {
109 self.consume_char()?;
110 Ok(Token::DictEnd)
111 } else {
112 Err(ParseError::SyntaxError {
113 position: self.position,
114 message: "Expected '>' after '>'".to_string(),
115 })
116 }
117 }
118 b'[' => {
119 self.consume_char()?;
120 Ok(Token::ArrayStart)
121 }
122 b']' => {
123 self.consume_char()?;
124 Ok(Token::ArrayEnd)
125 }
126 b't' | b'f' => self.read_boolean(),
127 b'n' => self.read_null(),
128 b'+' | b'-' | b'0'..=b'9' | b'.' => self.read_number(),
129 b'R' => {
130 self.consume_char()?;
132 Ok(Token::Name("R".to_string()))
133 }
134 _ if ch.is_ascii_alphabetic() => self.read_keyword(),
135 _ => Err(ParseError::SyntaxError {
136 position: self.position,
137 message: format!("Unexpected character: {}", ch as char),
138 }),
139 }
140 }
141
142 fn peek_char(&mut self) -> ParseResult<Option<u8>> {
144 if let Some(ch) = self.peek_buffer {
145 return Ok(Some(ch));
146 }
147
148 let mut buf = [0u8; 1];
149 match self.reader.read_exact(&mut buf) {
150 Ok(_) => {
151 self.peek_buffer = Some(buf[0]);
152 Ok(Some(buf[0]))
153 }
154 Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => Ok(None),
155 Err(e) => Err(e.into()),
156 }
157 }
158
159 fn consume_char(&mut self) -> ParseResult<Option<u8>> {
161 let ch = self.peek_char()?;
162 if ch.is_some() {
163 self.peek_buffer = None;
164 self.position += 1;
165 }
166 Ok(ch)
167 }
168
169 pub(crate) fn skip_whitespace(&mut self) -> ParseResult<usize> {
171 let mut count = 0;
172 while let Some(ch) = self.peek_char()? {
173 if ch.is_ascii_whitespace() {
174 self.consume_char()?;
175 count += 1;
176 } else {
177 break;
178 }
179 }
180 Ok(count)
181 }
182
183 fn read_comment(&mut self) -> ParseResult<Token> {
185 self.consume_char()?; let mut comment = String::new();
187
188 while let Some(ch) = self.peek_char()? {
189 if ch == b'\n' || ch == b'\r' {
190 break;
191 }
192 self.consume_char()?;
193 comment.push(ch as char);
194 }
195
196 Ok(Token::Comment(comment))
197 }
198
199 fn read_name(&mut self) -> ParseResult<Token> {
201 self.consume_char()?; let mut name = String::new();
203
204 while let Some(ch) = self.peek_char()? {
205 if ch.is_ascii_whitespace() ||
206 matches!(ch, b'/' | b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'%') {
207 break;
208 }
209 self.consume_char()?;
210
211 if ch == b'#' {
213 let hex1 = self.consume_char()?.ok_or_else(|| ParseError::SyntaxError {
214 position: self.position,
215 message: "Incomplete hex code in name".to_string(),
216 })?;
217 let hex2 = self.consume_char()?.ok_or_else(|| ParseError::SyntaxError {
218 position: self.position,
219 message: "Incomplete hex code in name".to_string(),
220 })?;
221
222 let value = u8::from_str_radix(
223 &format!("{}{}", hex1 as char, hex2 as char),
224 16
225 ).map_err(|_| ParseError::SyntaxError {
226 position: self.position,
227 message: "Invalid hex code in name".to_string(),
228 })?;
229
230 name.push(value as char);
231 } else {
232 name.push(ch as char);
233 }
234 }
235
236 Ok(Token::Name(name))
237 }
238
239 fn read_literal_string(&mut self) -> ParseResult<Token> {
241 self.consume_char()?; let mut string = Vec::new();
243 let mut paren_depth = 1;
244 let mut escape = false;
245
246 while paren_depth > 0 {
247 let ch = self.consume_char()?.ok_or_else(|| ParseError::SyntaxError {
248 position: self.position,
249 message: "Unterminated string".to_string(),
250 })?;
251
252 if escape {
253 let escaped = match ch {
254 b'n' => b'\n',
255 b'r' => b'\r',
256 b't' => b'\t',
257 b'b' => b'\x08',
258 b'f' => b'\x0C',
259 b'(' => b'(',
260 b')' => b')',
261 b'\\' => b'\\',
262 b'0'..=b'7' => {
263 let mut value = ch - b'0';
265 for _ in 0..2 {
266 if let Some(next) = self.peek_char()? {
267 if matches!(next, b'0'..=b'7') {
268 self.consume_char()?;
269 value = value * 8 + (next - b'0');
270 } else {
271 break;
272 }
273 }
274 }
275 value
276 }
277 _ => ch, };
279 string.push(escaped);
280 escape = false;
281 } else {
282 match ch {
283 b'\\' => escape = true,
284 b'(' => {
285 string.push(ch);
286 paren_depth += 1;
287 }
288 b')' => {
289 paren_depth -= 1;
290 if paren_depth > 0 {
291 string.push(ch);
292 }
293 }
294 _ => string.push(ch),
295 }
296 }
297 }
298
299 Ok(Token::String(string))
300 }
301
302 fn read_angle_bracket(&mut self) -> ParseResult<Token> {
304 self.consume_char()?; if self.peek_char()? == Some(b'<') {
307 self.consume_char()?;
308 Ok(Token::DictStart)
309 } else {
310 let mut hex_chars = String::new();
312 let mut found_end = false;
313
314 while let Some(ch) = self.peek_char()? {
315 if ch == b'>' {
316 self.consume_char()?;
317 found_end = true;
318 break;
319 }
320 self.consume_char()?;
321 if ch.is_ascii_hexdigit() {
322 hex_chars.push(ch as char);
323 } else if !ch.is_ascii_whitespace() {
324 return Err(ParseError::SyntaxError {
325 position: self.position,
326 message: "Invalid character in hex string".to_string(),
327 });
328 }
329 }
330
331 if !found_end {
332 return Err(ParseError::SyntaxError {
333 position: self.position,
334 message: "Unterminated hex string".to_string(),
335 });
336 }
337
338 if hex_chars.len() % 2 != 0 {
340 hex_chars.push('0');
341 }
342
343 let mut bytes = Vec::new();
345 for chunk in hex_chars.as_bytes().chunks(2) {
346 let hex_str = std::str::from_utf8(chunk).unwrap();
347 let byte = u8::from_str_radix(hex_str, 16).map_err(|_| ParseError::SyntaxError {
348 position: self.position,
349 message: "Invalid hex string".to_string(),
350 })?;
351 bytes.push(byte);
352 }
353
354 Ok(Token::String(bytes))
355 }
356 }
357
358 fn read_boolean(&mut self) -> ParseResult<Token> {
360 let word = self.read_word()?;
361 match word.as_str() {
362 "true" => Ok(Token::Boolean(true)),
363 "false" => Ok(Token::Boolean(false)),
364 _ => {
365 self.process_keyword(word)
367 }
368 }
369 }
370
371 fn read_null(&mut self) -> ParseResult<Token> {
373 let word = self.read_word()?;
374 if word == "null" {
375 Ok(Token::Null)
376 } else {
377 self.process_keyword(word)
379 }
380 }
381
382 fn read_number(&mut self) -> ParseResult<Token> {
384 let mut number_str = String::new();
385 let mut has_dot = false;
386
387 if let Some(ch) = self.peek_char()? {
389 if ch == b'+' || ch == b'-' {
390 self.consume_char()?;
391 number_str.push(ch as char);
392
393 if let Some(next) = self.peek_char()? {
395 if !next.is_ascii_digit() && next != b'.' {
396 return Err(ParseError::SyntaxError {
397 position: self.position,
398 message: "Expected digit after sign".to_string(),
399 });
400 }
401 }
402 }
403 }
404
405 while let Some(ch) = self.peek_char()? {
407 match ch {
408 b'0'..=b'9' => {
409 self.consume_char()?;
410 number_str.push(ch as char);
411 }
412 b'.' if !has_dot => {
413 self.consume_char()?;
414 number_str.push(ch as char);
415 has_dot = true;
416 }
417 _ => break,
418 }
419 }
420
421 if has_dot {
426 let value = number_str.parse::<f64>().map_err(|_| ParseError::SyntaxError {
427 position: self.position,
428 message: format!("Invalid real number: '{}'", number_str),
429 })?;
430 Ok(Token::Real(value))
431 } else {
432 let value = number_str.parse::<i64>().map_err(|_| ParseError::SyntaxError {
433 position: self.position,
434 message: format!("Invalid integer: '{}'", number_str),
435 })?;
436 Ok(Token::Integer(value))
437 }
438 }
439
440 fn read_keyword(&mut self) -> ParseResult<Token> {
442 let word = self.read_word()?;
443 self.process_keyword(word)
444 }
445
446 fn process_keyword(&self, word: String) -> ParseResult<Token> {
448 match word.as_str() {
449 "stream" => Ok(Token::Stream),
450 "endstream" => Ok(Token::EndStream),
451 "obj" => Ok(Token::Obj),
452 "endobj" => Ok(Token::EndObj),
453 "startxref" => Ok(Token::StartXRef),
454 _ => Err(ParseError::SyntaxError {
455 position: self.position,
456 message: format!("Unknown keyword: {}", word),
457 }),
458 }
459 }
460
461 fn read_word(&mut self) -> ParseResult<String> {
463 let mut word = String::new();
464
465 while let Some(ch) = self.peek_char()? {
466 if ch.is_ascii_whitespace() ||
467 matches!(ch, b'/' | b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'%') {
468 break;
469 }
470 self.consume_char()?;
471 word.push(ch as char);
472 }
473
474 Ok(word)
475 }
476
477 fn read_digits(&mut self) -> ParseResult<String> {
479 let mut digits = String::new();
480
481 while let Some(ch) = self.peek_char()? {
482 if ch.is_ascii_digit() {
483 self.consume_char()?;
484 digits.push(ch as char);
485 } else {
486 break;
487 }
488 }
489
490 Ok(digits)
491 }
492
493 pub fn read_newline(&mut self) -> ParseResult<()> {
495 match self.peek_char()? {
496 Some(b'\r') => {
497 self.consume_char()?;
498 if self.peek_char()? == Some(b'\n') {
500 self.consume_char()?;
501 }
502 Ok(())
503 }
504 Some(b'\n') => {
505 self.consume_char()?;
506 Ok(())
507 }
508 _ => Err(ParseError::SyntaxError {
509 position: self.position,
510 message: "Expected newline".to_string(),
511 }),
512 }
513 }
514
515 pub fn read_bytes(&mut self, n: usize) -> ParseResult<Vec<u8>> {
517 let mut bytes = vec![0u8; n];
518 self.reader.read_exact(&mut bytes)?;
519 self.position += n;
520 Ok(bytes)
521 }
522
523 pub fn read_until_sequence(&mut self, sequence: &[u8]) -> ParseResult<Vec<u8>> {
525 let mut result = Vec::new();
526 let mut match_pos = 0;
527
528 while let Some(ch) = self.consume_char()? {
529 result.push(ch);
530
531 if ch == sequence[match_pos] {
532 match_pos += 1;
533 if match_pos == sequence.len() {
534 result.truncate(result.len() - sequence.len());
536 break;
537 }
538 } else if ch == sequence[0] {
539 match_pos = 1;
540 } else {
541 match_pos = 0;
542 }
543 }
544
545 if match_pos < sequence.len() {
546 return Err(ParseError::SyntaxError {
547 position: self.position,
548 message: format!("Sequence {:?} not found", sequence),
549 });
550 }
551
552 Ok(result)
553 }
554
555 pub fn position(&self) -> usize {
557 self.position
558 }
559
560 pub fn push_token(&mut self, token: Token) {
562 self.token_buffer.push(token);
563 }
564}
565
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Builds a lexer over an in-memory byte slice.
    fn lexer_for(input: &[u8]) -> Lexer<Cursor<&[u8]>> {
        Lexer::new(Cursor::new(input))
    }

    /// Asserts that the lexer's next tokens are exactly `expected`, in order.
    fn assert_tokens(lexer: &mut Lexer<Cursor<&[u8]>>, expected: &[Token]) {
        for want in expected {
            assert_eq!(&lexer.next_token().unwrap(), want);
        }
    }

    /// Asserts that the next token is the bare `R`, which the lexer emits as a name.
    fn expect_r(lexer: &mut Lexer<Cursor<&[u8]>>) {
        let token = lexer.next_token().unwrap();
        if token != Token::Name("R".to_string()) {
            panic!("Expected R token, got {:?}", token);
        }
    }

    #[test]
    fn test_lexer_basic_tokens() {
        let mut lexer = lexer_for(b"123 -456 3.14 true false null /Name");
        assert_tokens(
            &mut lexer,
            &[
                Token::Integer(123),
                Token::Integer(-456),
                Token::Real(3.14),
                Token::Boolean(true),
                Token::Boolean(false),
                Token::Null,
                Token::Name("Name".to_string()),
                Token::Eof,
            ],
        );
    }

    #[test]
    fn test_lexer_negative_numbers() {
        let mut lexer = lexer_for(b"-123 -45.67");
        assert_tokens(&mut lexer, &[Token::Integer(-123), Token::Real(-45.67)]);
    }

    #[test]
    fn test_lexer_strings() {
        let mut lexer = lexer_for(b"(Hello World) <48656C6C6F>");
        assert_tokens(
            &mut lexer,
            &[
                Token::String(b"Hello World".to_vec()),
                Token::String(b"Hello".to_vec()),
            ],
        );
    }

    #[test]
    fn test_lexer_dictionaries() {
        let mut lexer = lexer_for(b"<< /Type /Page >>");
        assert_tokens(
            &mut lexer,
            &[
                Token::DictStart,
                Token::Name("Type".to_string()),
                Token::Name("Page".to_string()),
                Token::DictEnd,
            ],
        );
    }

    #[test]
    fn test_lexer_arrays() {
        let mut lexer = lexer_for(b"[1 2 3]");
        assert_tokens(
            &mut lexer,
            &[
                Token::ArrayStart,
                Token::Integer(1),
                Token::Integer(2),
                Token::Integer(3),
                Token::ArrayEnd,
            ],
        );
    }

    #[test]
    fn test_lexer_references() {
        let mut lexer = lexer_for(b"1 0 R 25 1 R");

        assert_tokens(&mut lexer, &[Token::Integer(1), Token::Integer(0)]);
        expect_r(&mut lexer);

        assert_tokens(&mut lexer, &[Token::Integer(25), Token::Integer(1)]);
        expect_r(&mut lexer);
    }

    #[test]
    fn test_lexer_comments() {
        let mut lexer = lexer_for(b"%PDF-1.7\n123");
        assert_tokens(
            &mut lexer,
            &[Token::Comment("PDF-1.7".to_string()), Token::Integer(123)],
        );
    }
}