oxidize_pdf/parser/
lexer.rs

1//! PDF Lexer
2//! 
3//! Tokenizes PDF syntax according to ISO 32000-1 Section 7.2
4
5use super::{ParseError, ParseResult};
6use std::io::Read;
7
/// Lexical tokens produced while scanning PDF syntax.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// The keyword `true` or `false`.
    Boolean(bool),

    /// A number written without a decimal point.
    Integer(i64),

    /// A number written with a decimal point.
    Real(f64),

    /// Decoded bytes of a string object, from either the literal `(...)`
    /// or the hexadecimal `<...>` form.
    String(Vec<u8>),

    /// A name object such as `/Type`.
    Name(String),

    /// The `[` that opens an array.
    ArrayStart,

    /// The `]` that closes an array.
    ArrayEnd,

    /// The `<<` that opens a dictionary.
    DictStart,

    /// The `>>` that closes a dictionary.
    DictEnd,

    /// The `stream` keyword.
    Stream,

    /// The `endstream` keyword.
    EndStream,

    /// The `obj` keyword.
    Obj,

    /// The `endobj` keyword.
    EndObj,

    /// The `startxref` keyword.
    StartXRef,

    /// An indirect reference such as `1 0 R`.
    // NOTE(review): the fields are presumably (object number, generation
    // number) — confirm against the parser that builds this variant.
    Reference(u32, u16),

    /// The `null` object.
    Null,

    /// Text of a `%` comment (callers usually discard it).
    Comment(String),

    /// Marks that the input has been exhausted.
    Eof,
}
65
66/// PDF Lexer for tokenizing PDF content
67pub struct Lexer<R: Read> {
68    reader: std::io::BufReader<R>,
69    buffer: Vec<u8>,
70    position: usize,
71    peek_buffer: Option<u8>,
72    token_buffer: Vec<Token>,
73}
74
75impl<R: Read> Lexer<R> {
76    /// Create a new lexer from a reader
77    pub fn new(reader: R) -> Self {
78        Self {
79            reader: std::io::BufReader::new(reader),
80            buffer: Vec::with_capacity(1024),
81            position: 0,
82            peek_buffer: None,
83            token_buffer: Vec::new(),
84        }
85    }
86    
87    /// Get the next token
88    pub fn next_token(&mut self) -> ParseResult<Token> {
89        // Check if we have a pushed-back token
90        if let Some(token) = self.token_buffer.pop() {
91            return Ok(token);
92        }
93        
94        self.skip_whitespace()?;
95        
96        let ch = match self.peek_char()? {
97            Some(ch) => ch,
98            None => return Ok(Token::Eof),
99        };
100        
101        match ch {
102            b'%' => self.read_comment(),
103            b'/' => self.read_name(),
104            b'(' => self.read_literal_string(),
105            b'<' => self.read_angle_bracket(),
106            b'>' => {
107                self.consume_char()?;
108                if self.peek_char()? == Some(b'>') {
109                    self.consume_char()?;
110                    Ok(Token::DictEnd)
111                } else {
112                    Err(ParseError::SyntaxError {
113                        position: self.position,
114                        message: "Expected '>' after '>'".to_string(),
115                    })
116                }
117            }
118            b'[' => {
119                self.consume_char()?;
120                Ok(Token::ArrayStart)
121            }
122            b']' => {
123                self.consume_char()?;
124                Ok(Token::ArrayEnd)
125            }
126            b't' | b'f' => self.read_boolean(),
127            b'n' => self.read_null(),
128            b'+' | b'-' | b'0'..=b'9' | b'.' => self.read_number(),
129            b'R' => {
130                // R could be a keyword (for references)
131                self.consume_char()?;
132                Ok(Token::Name("R".to_string()))
133            }
134            _ if ch.is_ascii_alphabetic() => self.read_keyword(),
135            _ => Err(ParseError::SyntaxError {
136                position: self.position,
137                message: format!("Unexpected character: {}", ch as char),
138            }),
139        }
140    }
141    
142    /// Peek at the next character without consuming it
143    fn peek_char(&mut self) -> ParseResult<Option<u8>> {
144        if let Some(ch) = self.peek_buffer {
145            return Ok(Some(ch));
146        }
147        
148        let mut buf = [0u8; 1];
149        match self.reader.read_exact(&mut buf) {
150            Ok(_) => {
151                self.peek_buffer = Some(buf[0]);
152                Ok(Some(buf[0]))
153            }
154            Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => Ok(None),
155            Err(e) => Err(e.into()),
156        }
157    }
158    
159    /// Consume the next character
160    fn consume_char(&mut self) -> ParseResult<Option<u8>> {
161        let ch = self.peek_char()?;
162        if ch.is_some() {
163            self.peek_buffer = None;
164            self.position += 1;
165        }
166        Ok(ch)
167    }
168    
169    /// Skip whitespace and return the number of bytes skipped
170    pub(crate) fn skip_whitespace(&mut self) -> ParseResult<usize> {
171        let mut count = 0;
172        while let Some(ch) = self.peek_char()? {
173            if ch.is_ascii_whitespace() {
174                self.consume_char()?;
175                count += 1;
176            } else {
177                break;
178            }
179        }
180        Ok(count)
181    }
182    
183    /// Read a comment (from % to end of line)
184    fn read_comment(&mut self) -> ParseResult<Token> {
185        self.consume_char()?; // consume '%'
186        let mut comment = String::new();
187        
188        while let Some(ch) = self.peek_char()? {
189            if ch == b'\n' || ch == b'\r' {
190                break;
191            }
192            self.consume_char()?;
193            comment.push(ch as char);
194        }
195        
196        Ok(Token::Comment(comment))
197    }
198    
199    /// Read a name object (e.g., /Type)
200    fn read_name(&mut self) -> ParseResult<Token> {
201        self.consume_char()?; // consume '/'
202        let mut name = String::new();
203        
204        while let Some(ch) = self.peek_char()? {
205            if ch.is_ascii_whitespace() || 
206               matches!(ch, b'/' | b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'%') {
207                break;
208            }
209            self.consume_char()?;
210            
211            // Handle hex codes in names (e.g., /A#20B means /A B)
212            if ch == b'#' {
213                let hex1 = self.consume_char()?.ok_or_else(|| ParseError::SyntaxError {
214                    position: self.position,
215                    message: "Incomplete hex code in name".to_string(),
216                })?;
217                let hex2 = self.consume_char()?.ok_or_else(|| ParseError::SyntaxError {
218                    position: self.position,
219                    message: "Incomplete hex code in name".to_string(),
220                })?;
221                
222                let value = u8::from_str_radix(
223                    &format!("{}{}", hex1 as char, hex2 as char),
224                    16
225                ).map_err(|_| ParseError::SyntaxError {
226                    position: self.position,
227                    message: "Invalid hex code in name".to_string(),
228                })?;
229                
230                name.push(value as char);
231            } else {
232                name.push(ch as char);
233            }
234        }
235        
236        Ok(Token::Name(name))
237    }
238    
239    /// Read a literal string (parentheses)
240    fn read_literal_string(&mut self) -> ParseResult<Token> {
241        self.consume_char()?; // consume '('
242        let mut string = Vec::new();
243        let mut paren_depth = 1;
244        let mut escape = false;
245        
246        while paren_depth > 0 {
247            let ch = self.consume_char()?.ok_or_else(|| ParseError::SyntaxError {
248                position: self.position,
249                message: "Unterminated string".to_string(),
250            })?;
251            
252            if escape {
253                let escaped = match ch {
254                    b'n' => b'\n',
255                    b'r' => b'\r',
256                    b't' => b'\t',
257                    b'b' => b'\x08',
258                    b'f' => b'\x0C',
259                    b'(' => b'(',
260                    b')' => b')',
261                    b'\\' => b'\\',
262                    b'0'..=b'7' => {
263                        // Octal escape sequence
264                        let mut value = ch - b'0';
265                        for _ in 0..2 {
266                            if let Some(next) = self.peek_char()? {
267                                if matches!(next, b'0'..=b'7') {
268                                    self.consume_char()?;
269                                    value = value * 8 + (next - b'0');
270                                } else {
271                                    break;
272                                }
273                            }
274                        }
275                        value
276                    }
277                    _ => ch, // Unknown escape, use literal
278                };
279                string.push(escaped);
280                escape = false;
281            } else {
282                match ch {
283                    b'\\' => escape = true,
284                    b'(' => {
285                        string.push(ch);
286                        paren_depth += 1;
287                    }
288                    b')' => {
289                        paren_depth -= 1;
290                        if paren_depth > 0 {
291                            string.push(ch);
292                        }
293                    }
294                    _ => string.push(ch),
295                }
296            }
297        }
298        
299        Ok(Token::String(string))
300    }
301    
302    /// Read angle bracket tokens (hex strings or dict markers)
303    fn read_angle_bracket(&mut self) -> ParseResult<Token> {
304        self.consume_char()?; // consume '<'
305        
306        if self.peek_char()? == Some(b'<') {
307            self.consume_char()?;
308            Ok(Token::DictStart)
309        } else {
310            // Hex string
311            let mut hex_chars = String::new();
312            let mut found_end = false;
313            
314            while let Some(ch) = self.peek_char()? {
315                if ch == b'>' {
316                    self.consume_char()?;
317                    found_end = true;
318                    break;
319                }
320                self.consume_char()?;
321                if ch.is_ascii_hexdigit() {
322                    hex_chars.push(ch as char);
323                } else if !ch.is_ascii_whitespace() {
324                    return Err(ParseError::SyntaxError {
325                        position: self.position,
326                        message: "Invalid character in hex string".to_string(),
327                    });
328                }
329            }
330            
331            if !found_end {
332                return Err(ParseError::SyntaxError {
333                    position: self.position,
334                    message: "Unterminated hex string".to_string(),
335                });
336            }
337            
338            // Pad with 0 if odd number of digits
339            if hex_chars.len() % 2 != 0 {
340                hex_chars.push('0');
341            }
342            
343            // Convert hex to bytes
344            let mut bytes = Vec::new();
345            for chunk in hex_chars.as_bytes().chunks(2) {
346                let hex_str = std::str::from_utf8(chunk).unwrap();
347                let byte = u8::from_str_radix(hex_str, 16).map_err(|_| ParseError::SyntaxError {
348                    position: self.position,
349                    message: "Invalid hex string".to_string(),
350                })?;
351                bytes.push(byte);
352            }
353            
354            Ok(Token::String(bytes))
355        }
356    }
357    
358    /// Read boolean (true/false)
359    fn read_boolean(&mut self) -> ParseResult<Token> {
360        let word = self.read_word()?;
361        match word.as_str() {
362            "true" => Ok(Token::Boolean(true)),
363            "false" => Ok(Token::Boolean(false)),
364            _ => {
365                // Not a boolean, might be a keyword
366                self.process_keyword(word)
367            }
368        }
369    }
370    
371    /// Read null
372    fn read_null(&mut self) -> ParseResult<Token> {
373        let word = self.read_word()?;
374        if word == "null" {
375            Ok(Token::Null)
376        } else {
377            // Not null, might be a keyword
378            self.process_keyword(word)
379        }
380    }
381    
382    /// Read a number (integer or real)
383    fn read_number(&mut self) -> ParseResult<Token> {
384        let mut number_str = String::new();
385        let mut has_dot = false;
386        
387        // Handle sign - consume it first
388        if let Some(ch) = self.peek_char()? {
389            if ch == b'+' || ch == b'-' {
390                self.consume_char()?;
391                number_str.push(ch as char);
392                
393                // After sign, we must have at least one digit
394                if let Some(next) = self.peek_char()? {
395                    if !next.is_ascii_digit() && next != b'.' {
396                        return Err(ParseError::SyntaxError {
397                            position: self.position,
398                            message: "Expected digit after sign".to_string(),
399                        });
400                    }
401                }
402            }
403        }
404        
405        // Read digits and decimal point
406        while let Some(ch) = self.peek_char()? {
407            match ch {
408                b'0'..=b'9' => {
409                    self.consume_char()?;
410                    number_str.push(ch as char);
411                }
412                b'.' if !has_dot => {
413                    self.consume_char()?;
414                    number_str.push(ch as char);
415                    has_dot = true;
416                }
417                _ => break,
418            }
419        }
420        
421        // Don't try to parse references here - let the parser handle it
422        // References are just "num num R" and can be handled at a higher level
423        
424        // Parse as number
425        if has_dot {
426            let value = number_str.parse::<f64>().map_err(|_| ParseError::SyntaxError {
427                position: self.position,
428                message: format!("Invalid real number: '{}'", number_str),
429            })?;
430            Ok(Token::Real(value))
431        } else {
432            let value = number_str.parse::<i64>().map_err(|_| ParseError::SyntaxError {
433                position: self.position,
434                message: format!("Invalid integer: '{}'", number_str),
435            })?;
436            Ok(Token::Integer(value))
437        }
438    }
439    
440    /// Read a keyword
441    fn read_keyword(&mut self) -> ParseResult<Token> {
442        let word = self.read_word()?;
443        self.process_keyword(word)
444    }
445    
446    /// Process a word as a keyword
447    fn process_keyword(&self, word: String) -> ParseResult<Token> {
448        match word.as_str() {
449            "stream" => Ok(Token::Stream),
450            "endstream" => Ok(Token::EndStream),
451            "obj" => Ok(Token::Obj),
452            "endobj" => Ok(Token::EndObj),
453            "startxref" => Ok(Token::StartXRef),
454            _ => Err(ParseError::SyntaxError {
455                position: self.position,
456                message: format!("Unknown keyword: {}", word),
457            }),
458        }
459    }
460    
461    /// Read a word (sequence of non-delimiter characters)
462    fn read_word(&mut self) -> ParseResult<String> {
463        let mut word = String::new();
464        
465        while let Some(ch) = self.peek_char()? {
466            if ch.is_ascii_whitespace() || 
467               matches!(ch, b'/' | b'<' | b'>' | b'[' | b']' | b'(' | b')' | b'%') {
468                break;
469            }
470            self.consume_char()?;
471            word.push(ch as char);
472        }
473        
474        Ok(word)
475    }
476    
477    /// Read a sequence of digits
478    fn read_digits(&mut self) -> ParseResult<String> {
479        let mut digits = String::new();
480        
481        while let Some(ch) = self.peek_char()? {
482            if ch.is_ascii_digit() {
483                self.consume_char()?;
484                digits.push(ch as char);
485            } else {
486                break;
487            }
488        }
489        
490        Ok(digits)
491    }
492    
493    /// Read a newline sequence (CR, LF, or CRLF)
494    pub fn read_newline(&mut self) -> ParseResult<()> {
495        match self.peek_char()? {
496            Some(b'\r') => {
497                self.consume_char()?;
498                // Check for CRLF
499                if self.peek_char()? == Some(b'\n') {
500                    self.consume_char()?;
501                }
502                Ok(())
503            }
504            Some(b'\n') => {
505                self.consume_char()?;
506                Ok(())
507            }
508            _ => Err(ParseError::SyntaxError {
509                position: self.position,
510                message: "Expected newline".to_string(),
511            }),
512        }
513    }
514    
515    /// Read exactly n bytes
516    pub fn read_bytes(&mut self, n: usize) -> ParseResult<Vec<u8>> {
517        let mut bytes = vec![0u8; n];
518        self.reader.read_exact(&mut bytes)?;
519        self.position += n;
520        Ok(bytes)
521    }
522    
523    /// Read until a specific byte sequence is found
524    pub fn read_until_sequence(&mut self, sequence: &[u8]) -> ParseResult<Vec<u8>> {
525        let mut result = Vec::new();
526        let mut match_pos = 0;
527        
528        while let Some(ch) = self.consume_char()? {
529            result.push(ch);
530            
531            if ch == sequence[match_pos] {
532                match_pos += 1;
533                if match_pos == sequence.len() {
534                    // Found the sequence, remove it from result
535                    result.truncate(result.len() - sequence.len());
536                    break;
537                }
538            } else if ch == sequence[0] {
539                match_pos = 1;
540            } else {
541                match_pos = 0;
542            }
543        }
544        
545        if match_pos < sequence.len() {
546            return Err(ParseError::SyntaxError {
547                position: self.position,
548                message: format!("Sequence {:?} not found", sequence),
549            });
550        }
551        
552        Ok(result)
553    }
554    
555    /// Get current position
556    pub fn position(&self) -> usize {
557        self.position
558    }
559    
560    /// Push back a token to be returned by the next call to next_token
561    pub fn push_token(&mut self, token: Token) {
562        self.token_buffer.push(token);
563    }
564}
565
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Lex `input` and return its first `count` tokens, panicking on any
    /// lexer error.
    fn lex(input: &[u8], count: usize) -> Vec<Token> {
        let mut lexer = Lexer::new(Cursor::new(input));
        (0..count).map(|_| lexer.next_token().unwrap()).collect()
    }

    /// Shorthand for a name token.
    fn name(text: &str) -> Token {
        Token::Name(text.to_string())
    }

    #[test]
    fn test_lexer_basic_tokens() {
        // Positive and negative numbers, keywords, a name, then end of input.
        let tokens = lex(b"123 -456 3.14 true false null /Name", 8);

        assert_eq!(
            tokens,
            vec![
                Token::Integer(123),
                Token::Integer(-456),
                Token::Real(3.14),
                Token::Boolean(true),
                Token::Boolean(false),
                Token::Null,
                name("Name"),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_lexer_negative_numbers() {
        // The sign is attached directly to the digits.
        let tokens = lex(b"-123 -45.67", 2);

        assert_eq!(tokens, vec![Token::Integer(-123), Token::Real(-45.67)]);
    }

    #[test]
    fn test_lexer_strings() {
        // One literal string and one hexadecimal string.
        let tokens = lex(b"(Hello World) <48656C6C6F>", 2);

        assert_eq!(
            tokens,
            vec![
                Token::String(b"Hello World".to_vec()),
                Token::String(b"Hello".to_vec()),
            ]
        );
    }

    #[test]
    fn test_lexer_dictionaries() {
        let tokens = lex(b"<< /Type /Page >>", 4);

        assert_eq!(
            tokens,
            vec![Token::DictStart, name("Type"), name("Page"), Token::DictEnd]
        );
    }

    #[test]
    fn test_lexer_arrays() {
        let tokens = lex(b"[1 2 3]", 5);

        assert_eq!(
            tokens,
            vec![
                Token::ArrayStart,
                Token::Integer(1),
                Token::Integer(2),
                Token::Integer(3),
                Token::ArrayEnd,
            ]
        );
    }

    #[test]
    fn test_lexer_references() {
        // The lexer does not assemble references: each one arrives as two
        // integers followed by the marker, which is reported as the name "R".
        let tokens = lex(b"1 0 R 25 1 R", 6);

        assert_eq!(
            tokens,
            vec![
                Token::Integer(1),
                Token::Integer(0),
                name("R"),
                Token::Integer(25),
                Token::Integer(1),
                name("R"),
            ]
        );
    }

    #[test]
    fn test_lexer_comments() {
        // The comment text excludes the '%' and the line terminator.
        let tokens = lex(b"%PDF-1.7\n123", 2);

        assert_eq!(
            tokens,
            vec![Token::Comment("PDF-1.7".to_string()), Token::Integer(123)]
        );
    }
}