oxidize_pdf/parser/
lexer.rs

1//! PDF Lexer
2//!
3//! Tokenizes PDF syntax according to ISO 32000-1 Section 7.2
4
5use super::{ParseError, ParseResult};
6use std::io::Read;
7
/// PDF Token types
///
/// One value of this enum is returned by each call to `Lexer::next_token`.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// Boolean: the keywords `true` or `false`
    Boolean(bool),

    /// Integer number (text without a decimal point, parsed as `i64`)
    Integer(i64),

    /// Real number (text containing a decimal point, parsed as `f64`)
    Real(f64),

    /// String (literal or hexadecimal), stored as the decoded bytes without
    /// the surrounding `( )` or `< >` delimiters
    String(Vec<u8>),

    /// Name object (e.g., /Type), stored without the leading `/`
    Name(String),

    /// Left square bracket [
    ArrayStart,

    /// Right square bracket ]
    ArrayEnd,

    /// Dictionary start <<
    DictStart,

    /// Dictionary end >>
    DictEnd,

    /// Stream keyword
    Stream,

    /// Endstream keyword
    EndStream,

    /// Obj keyword
    Obj,

    /// Endobj keyword
    EndObj,

    /// StartXRef keyword
    StartXRef,

    /// Reference (e.g., 1 0 R)
    ///
    /// NOTE(review): nothing in the lexer code in this file constructs this
    /// variant; `1 0 R` is emitted as two integers followed by `Name("R")`.
    /// Presumably a higher layer assembles references - confirm.
    Reference(u32, u16),

    /// Null object
    Null,

    /// Comment text between `%` and the end of the line (both excluded)
    Comment(String),

    /// End of file
    Eof,
}
65
/// PDF Lexer for tokenizing PDF content
///
/// Pulls bytes one at a time from a buffered reader, keeping a single byte of
/// lookahead and a stack of tokens that were pushed back by the caller.
pub struct Lexer<R: Read> {
    /// Buffered wrapper around the reader handed to `new`.
    reader: std::io::BufReader<R>,
    /// Pre-allocated in `new` but never read by the code in this file.
    #[allow(dead_code)]
    buffer: Vec<u8>,
    /// Running count of bytes consumed via `consume_char` and `read_bytes`.
    position: usize,
    /// One byte of lookahead: filled by `peek_char`, cleared by `consume_char`.
    peek_buffer: Option<u8>,
    /// Tokens handed back through `push_token`; `next_token` pops from the end,
    /// so the most recently pushed token is returned first.
    token_buffer: Vec<Token>,
}
75
76impl<R: Read> Lexer<R> {
77    /// Create a new lexer from a reader
78    pub fn new(reader: R) -> Self {
79        Self {
80            reader: std::io::BufReader::new(reader),
81            buffer: Vec::with_capacity(1024),
82            position: 0,
83            peek_buffer: None,
84            token_buffer: Vec::new(),
85        }
86    }
87
88    /// Get the next token
89    pub fn next_token(&mut self) -> ParseResult<Token> {
90        // Check if we have a pushed-back token
91        if let Some(token) = self.token_buffer.pop() {
92            return Ok(token);
93        }
94
95        self.skip_whitespace()?;
96
97        let ch = match self.peek_char()? {
98            Some(ch) => ch,
99            None => return Ok(Token::Eof),
100        };
101
102        match ch {
103            b'%' => self.read_comment(),
104            b'/' => self.read_name(),
105            b'(' => self.read_literal_string(),
106            b'<' => self.read_angle_bracket(),
107            b'>' => {
108                self.consume_char()?;
109                if self.peek_char()? == Some(b'>') {
110                    self.consume_char()?;
111                    Ok(Token::DictEnd)
112                } else {
113                    Err(ParseError::SyntaxError {
114                        position: self.position,
115                        message: "Expected '>' after '>'".to_string(),
116                    })
117                }
118            }
119            b'[' => {
120                self.consume_char()?;
121                Ok(Token::ArrayStart)
122            }
123            b']' => {
124                self.consume_char()?;
125                Ok(Token::ArrayEnd)
126            }
127            b't' | b'f' => self.read_boolean(),
128            b'n' => self.read_null(),
129            b'+' | b'-' | b'0'..=b'9' | b'.' => self.read_number(),
130            b'R' => {
131                // R could be a keyword (for references)
132                self.consume_char()?;
133                Ok(Token::Name("R".to_string()))
134            }
135            _ if ch.is_ascii_alphabetic() => self.read_keyword(),
136            _ => Err(ParseError::SyntaxError {
137                position: self.position,
138                message: format!("Unexpected character: {}", ch as char),
139            }),
140        }
141    }
142
143    /// Peek at the next character without consuming it
144    fn peek_char(&mut self) -> ParseResult<Option<u8>> {
145        if let Some(ch) = self.peek_buffer {
146            return Ok(Some(ch));
147        }
148
149        let mut buf = [0u8; 1];
150        match self.reader.read_exact(&mut buf) {
151            Ok(_) => {
152                self.peek_buffer = Some(buf[0]);
153                Ok(Some(buf[0]))
154            }
155            Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => Ok(None),
156            Err(e) => Err(e.into()),
157        }
158    }
159
160    /// Consume the next character
161    fn consume_char(&mut self) -> ParseResult<Option<u8>> {
162        let ch = self.peek_char()?;
163        if ch.is_some() {
164            self.peek_buffer = None;
165            self.position += 1;
166        }
167        Ok(ch)
168    }
169
170    /// Skip whitespace and return the number of bytes skipped
171    pub(crate) fn skip_whitespace(&mut self) -> ParseResult<usize> {
172        let mut count = 0;
173        while let Some(ch) = self.peek_char()? {
174            if ch.is_ascii_whitespace() {
175                self.consume_char()?;
176                count += 1;
177            } else {
178                break;
179            }
180        }
181        Ok(count)
182    }
183
184    /// Read a comment (from % to end of line)
185    fn read_comment(&mut self) -> ParseResult<Token> {
186        self.consume_char()?; // consume '%'
187        let mut comment = String::new();
188
189        while let Some(ch) = self.peek_char()? {
190            if ch == b'\n' || ch == b'\r' {
191                break;
192            }
193            self.consume_char()?;
194            comment.push(ch as char);
195        }
196
197        Ok(Token::Comment(comment))
198    }
199
200    /// Read a name object (e.g., /Type)
201    fn read_name(&mut self) -> ParseResult<Token> {
202        self.consume_char()?; // consume '/'
203        let mut name = String::new();
204
205        while let Some(ch) = self.peek_char()? {
206            if ch.is_ascii_whitespace()
207                || matches!(ch, b'/' | b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'%')
208            {
209                break;
210            }
211            self.consume_char()?;
212
213            // Handle hex codes in names (e.g., /A#20B means /A B)
214            if ch == b'#' {
215                let hex1 = self
216                    .consume_char()?
217                    .ok_or_else(|| ParseError::SyntaxError {
218                        position: self.position,
219                        message: "Incomplete hex code in name".to_string(),
220                    })?;
221                let hex2 = self
222                    .consume_char()?
223                    .ok_or_else(|| ParseError::SyntaxError {
224                        position: self.position,
225                        message: "Incomplete hex code in name".to_string(),
226                    })?;
227
228                let value = u8::from_str_radix(&format!("{}{}", hex1 as char, hex2 as char), 16)
229                    .map_err(|_| ParseError::SyntaxError {
230                        position: self.position,
231                        message: "Invalid hex code in name".to_string(),
232                    })?;
233
234                name.push(value as char);
235            } else {
236                name.push(ch as char);
237            }
238        }
239
240        Ok(Token::Name(name))
241    }
242
243    /// Read a literal string (parentheses)
244    fn read_literal_string(&mut self) -> ParseResult<Token> {
245        self.consume_char()?; // consume '('
246        let mut string = Vec::new();
247        let mut paren_depth = 1;
248        let mut escape = false;
249
250        while paren_depth > 0 {
251            let ch = self
252                .consume_char()?
253                .ok_or_else(|| ParseError::SyntaxError {
254                    position: self.position,
255                    message: "Unterminated string".to_string(),
256                })?;
257
258            if escape {
259                let escaped = match ch {
260                    b'n' => b'\n',
261                    b'r' => b'\r',
262                    b't' => b'\t',
263                    b'b' => b'\x08',
264                    b'f' => b'\x0C',
265                    b'(' => b'(',
266                    b')' => b')',
267                    b'\\' => b'\\',
268                    b'0'..=b'7' => {
269                        // Octal escape sequence
270                        let mut value = ch - b'0';
271                        for _ in 0..2 {
272                            if let Some(next) = self.peek_char()? {
273                                if matches!(next, b'0'..=b'7') {
274                                    self.consume_char()?;
275                                    value = value * 8 + (next - b'0');
276                                } else {
277                                    break;
278                                }
279                            }
280                        }
281                        value
282                    }
283                    _ => ch, // Unknown escape, use literal
284                };
285                string.push(escaped);
286                escape = false;
287            } else {
288                match ch {
289                    b'\\' => escape = true,
290                    b'(' => {
291                        string.push(ch);
292                        paren_depth += 1;
293                    }
294                    b')' => {
295                        paren_depth -= 1;
296                        if paren_depth > 0 {
297                            string.push(ch);
298                        }
299                    }
300                    _ => string.push(ch),
301                }
302            }
303        }
304
305        Ok(Token::String(string))
306    }
307
308    /// Read angle bracket tokens (hex strings or dict markers)
309    fn read_angle_bracket(&mut self) -> ParseResult<Token> {
310        self.consume_char()?; // consume '<'
311
312        if self.peek_char()? == Some(b'<') {
313            self.consume_char()?;
314            Ok(Token::DictStart)
315        } else {
316            // Hex string
317            let mut hex_chars = String::new();
318            let mut found_end = false;
319
320            while let Some(ch) = self.peek_char()? {
321                if ch == b'>' {
322                    self.consume_char()?;
323                    found_end = true;
324                    break;
325                }
326                self.consume_char()?;
327                if ch.is_ascii_hexdigit() {
328                    hex_chars.push(ch as char);
329                } else if !ch.is_ascii_whitespace() {
330                    return Err(ParseError::SyntaxError {
331                        position: self.position,
332                        message: "Invalid character in hex string".to_string(),
333                    });
334                }
335            }
336
337            if !found_end {
338                return Err(ParseError::SyntaxError {
339                    position: self.position,
340                    message: "Unterminated hex string".to_string(),
341                });
342            }
343
344            // Pad with 0 if odd number of digits
345            if hex_chars.len() % 2 != 0 {
346                hex_chars.push('0');
347            }
348
349            // Convert hex to bytes
350            let mut bytes = Vec::new();
351            for chunk in hex_chars.as_bytes().chunks(2) {
352                let hex_str = std::str::from_utf8(chunk).unwrap();
353                let byte =
354                    u8::from_str_radix(hex_str, 16).map_err(|_| ParseError::SyntaxError {
355                        position: self.position,
356                        message: "Invalid hex string".to_string(),
357                    })?;
358                bytes.push(byte);
359            }
360
361            Ok(Token::String(bytes))
362        }
363    }
364
365    /// Read boolean (true/false)
366    fn read_boolean(&mut self) -> ParseResult<Token> {
367        let word = self.read_word()?;
368        match word.as_str() {
369            "true" => Ok(Token::Boolean(true)),
370            "false" => Ok(Token::Boolean(false)),
371            _ => {
372                // Not a boolean, might be a keyword
373                self.process_keyword(word)
374            }
375        }
376    }
377
378    /// Read null
379    fn read_null(&mut self) -> ParseResult<Token> {
380        let word = self.read_word()?;
381        if word == "null" {
382            Ok(Token::Null)
383        } else {
384            // Not null, might be a keyword
385            self.process_keyword(word)
386        }
387    }
388
389    /// Read a number (integer or real)
390    fn read_number(&mut self) -> ParseResult<Token> {
391        let mut number_str = String::new();
392        let mut has_dot = false;
393
394        // Handle sign - consume it first
395        if let Some(ch) = self.peek_char()? {
396            if ch == b'+' || ch == b'-' {
397                self.consume_char()?;
398                number_str.push(ch as char);
399
400                // After sign, we must have at least one digit
401                if let Some(next) = self.peek_char()? {
402                    if !next.is_ascii_digit() && next != b'.' {
403                        return Err(ParseError::SyntaxError {
404                            position: self.position,
405                            message: "Expected digit after sign".to_string(),
406                        });
407                    }
408                }
409            }
410        }
411
412        // Read digits and decimal point
413        while let Some(ch) = self.peek_char()? {
414            match ch {
415                b'0'..=b'9' => {
416                    self.consume_char()?;
417                    number_str.push(ch as char);
418                }
419                b'.' if !has_dot => {
420                    self.consume_char()?;
421                    number_str.push(ch as char);
422                    has_dot = true;
423                }
424                _ => break,
425            }
426        }
427
428        // Don't try to parse references here - let the parser handle it
429        // References are just "num num R" and can be handled at a higher level
430
431        // Parse as number
432        if has_dot {
433            let value = number_str
434                .parse::<f64>()
435                .map_err(|_| ParseError::SyntaxError {
436                    position: self.position,
437                    message: format!("Invalid real number: '{number_str}'"),
438                })?;
439            Ok(Token::Real(value))
440        } else {
441            let value = number_str
442                .parse::<i64>()
443                .map_err(|_| ParseError::SyntaxError {
444                    position: self.position,
445                    message: format!("Invalid integer: '{number_str}'"),
446                })?;
447            Ok(Token::Integer(value))
448        }
449    }
450
451    /// Read a keyword
452    fn read_keyword(&mut self) -> ParseResult<Token> {
453        let word = self.read_word()?;
454        self.process_keyword(word)
455    }
456
457    /// Process a word as a keyword
458    fn process_keyword(&self, word: String) -> ParseResult<Token> {
459        match word.as_str() {
460            "stream" => Ok(Token::Stream),
461            "endstream" => Ok(Token::EndStream),
462            "obj" => Ok(Token::Obj),
463            "endobj" => Ok(Token::EndObj),
464            "startxref" => Ok(Token::StartXRef),
465            _ => Err(ParseError::SyntaxError {
466                position: self.position,
467                message: format!("Unknown keyword: {word}"),
468            }),
469        }
470    }
471
472    /// Read a word (sequence of non-delimiter characters)
473    fn read_word(&mut self) -> ParseResult<String> {
474        let mut word = String::new();
475
476        while let Some(ch) = self.peek_char()? {
477            if ch.is_ascii_whitespace()
478                || matches!(ch, b'/' | b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'%')
479            {
480                break;
481            }
482            self.consume_char()?;
483            word.push(ch as char);
484        }
485
486        Ok(word)
487    }
488
489    /// Read a sequence of digits
490    #[allow(dead_code)]
491    fn read_digits(&mut self) -> ParseResult<String> {
492        let mut digits = String::new();
493
494        while let Some(ch) = self.peek_char()? {
495            if ch.is_ascii_digit() {
496                self.consume_char()?;
497                digits.push(ch as char);
498            } else {
499                break;
500            }
501        }
502
503        Ok(digits)
504    }
505
506    /// Read a newline sequence (CR, LF, or CRLF)
507    pub fn read_newline(&mut self) -> ParseResult<()> {
508        match self.peek_char()? {
509            Some(b'\r') => {
510                self.consume_char()?;
511                // Check for CRLF
512                if self.peek_char()? == Some(b'\n') {
513                    self.consume_char()?;
514                }
515                Ok(())
516            }
517            Some(b'\n') => {
518                self.consume_char()?;
519                Ok(())
520            }
521            _ => Err(ParseError::SyntaxError {
522                position: self.position,
523                message: "Expected newline".to_string(),
524            }),
525        }
526    }
527
528    /// Read exactly n bytes
529    pub fn read_bytes(&mut self, n: usize) -> ParseResult<Vec<u8>> {
530        let mut bytes = vec![0u8; n];
531        self.reader.read_exact(&mut bytes)?;
532        self.position += n;
533        Ok(bytes)
534    }
535
536    /// Read until a specific byte sequence is found
537    pub fn read_until_sequence(&mut self, sequence: &[u8]) -> ParseResult<Vec<u8>> {
538        let mut result = Vec::new();
539        let mut match_pos = 0;
540
541        while let Some(ch) = self.consume_char()? {
542            result.push(ch);
543
544            if ch == sequence[match_pos] {
545                match_pos += 1;
546                if match_pos == sequence.len() {
547                    // Found the sequence, remove it from result
548                    result.truncate(result.len() - sequence.len());
549                    break;
550                }
551            } else if ch == sequence[0] {
552                match_pos = 1;
553            } else {
554                match_pos = 0;
555            }
556        }
557
558        if match_pos < sequence.len() {
559            return Err(ParseError::SyntaxError {
560                position: self.position,
561                message: format!("Sequence {sequence:?} not found"),
562            });
563        }
564
565        Ok(result)
566    }
567
568    /// Get current position
569    pub fn position(&self) -> usize {
570        self.position
571    }
572
573    /// Push back a token to be returned by the next call to next_token
574    pub fn push_token(&mut self, token: Token) {
575        self.token_buffer.push(token);
576    }
577}
578
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// One token of each scalar kind, followed by end of input.
    #[test]
    fn test_lexer_basic_tokens() {
        // Test positive and negative numbers
        let input = b"123 -456 3.14 true false null /Name";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(-456));
        assert_eq!(lexer.next_token().unwrap(), Token::Real(3.14));
        assert_eq!(lexer.next_token().unwrap(), Token::Boolean(true));
        assert_eq!(lexer.next_token().unwrap(), Token::Boolean(false));
        assert_eq!(lexer.next_token().unwrap(), Token::Null);
        assert_eq!(lexer.next_token().unwrap(), Token::Name("Name".to_string()));
        assert_eq!(lexer.next_token().unwrap(), Token::Eof);
    }

    /// A leading minus sign is part of the number, for integers and reals.
    #[test]
    fn test_lexer_negative_numbers() {
        // Test negative numbers without space
        let input = b"-123 -45.67";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(lexer.next_token().unwrap(), Token::Integer(-123));
        assert_eq!(lexer.next_token().unwrap(), Token::Real(-45.67));
    }

    /// Literal and hexadecimal strings both decode to their raw bytes.
    #[test]
    fn test_lexer_strings() {
        let input = b"(Hello World) <48656C6C6F>";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(
            lexer.next_token().unwrap(),
            Token::String(b"Hello World".to_vec())
        );
        assert_eq!(
            lexer.next_token().unwrap(),
            Token::String(b"Hello".to_vec())
        );
    }

    /// `<<` and `>>` are dictionary delimiters, with names in between.
    #[test]
    fn test_lexer_dictionaries() {
        let input = b"<< /Type /Page >>";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(lexer.next_token().unwrap(), Token::DictStart);
        assert_eq!(lexer.next_token().unwrap(), Token::Name("Type".to_string()));
        assert_eq!(lexer.next_token().unwrap(), Token::Name("Page".to_string()));
        assert_eq!(lexer.next_token().unwrap(), Token::DictEnd);
    }

    /// Brackets delimit arrays and need no whitespace around them.
    #[test]
    fn test_lexer_arrays() {
        let input = b"[1 2 3]";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(lexer.next_token().unwrap(), Token::ArrayStart);
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(2));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(3));
        assert_eq!(lexer.next_token().unwrap(), Token::ArrayEnd);
    }

    /// An indirect reference comes out as two integers and the name `R`.
    #[test]
    fn test_lexer_references() {
        let input = b"1 0 R 25 1 R";
        let mut lexer = Lexer::new(Cursor::new(input));

        // The lexer emits the parts of a reference as separate tokens
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(0));
        // The bare 'R' is expected as Token::Name("R")
        match lexer.next_token().unwrap() {
            Token::Name(s) if s == "R" => {} // The only accepted form
            other => panic!("Expected R token, got {other:?}"),
        }

        assert_eq!(lexer.next_token().unwrap(), Token::Integer(25));
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(1));
        match lexer.next_token().unwrap() {
            Token::Name(s) if s == "R" => {} // The only accepted form
            other => panic!("Expected R token, got {other:?}"),
        }
    }

    /// A comment is returned as a token, without its `%` or line ending.
    #[test]
    fn test_lexer_comments() {
        let input = b"%PDF-1.7\n123";
        let mut lexer = Lexer::new(Cursor::new(input));

        assert_eq!(
            lexer.next_token().unwrap(),
            Token::Comment("PDF-1.7".to_string())
        );
        assert_eq!(lexer.next_token().unwrap(), Token::Integer(123));
    }
}