Skip to main content

folio_cos/
tokenizer.rs

1//! PDF tokenizer.
2//!
3//! Breaks a raw byte stream into a sequence of PDF tokens.
4//! Handles all PDF lexical conventions per ISO 32000-2:2020 §7.2.
5
6use folio_core::{FolioError, Result};
7
/// One lexical element produced by [`Tokenizer`].
///
/// String-like payloads are raw bytes rather than `String`s; nothing in this
/// module assumes they are valid UTF-8.
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
    /// Integer numeric object, e.g. `42` or `-17`.
    Integer(i64),
    /// Real numeric object, e.g. `3.5` or `-.25`.
    Real(f64),
    /// Contents of a `( ... )` literal string after escape sequences have been
    /// resolved; the outer parentheses are not included.
    LiteralString(Vec<u8>),
    /// Decoded bytes of a `< ... >` hexadecimal string.
    HexString(Vec<u8>),
    /// A name object, without its leading `/`.
    Name(Vec<u8>),
    /// Any other bare word: `true`, `false`, `null`, `obj`, `endobj`, `stream`,
    /// `endstream`, `xref`, `trailer`, `startxref`, `R`, and so on.
    Keyword(Vec<u8>),
    /// The array opener `[`.
    ArrayBegin,
    /// The array closer `]`.
    ArrayEnd,
    /// The dictionary opener `<<`.
    DictBegin,
    /// The dictionary closer `>>`.
    DictEnd,
}
32
/// Cursor-based PDF tokenizer that reads from a borrowed byte slice.
///
/// The tokenizer never copies the input; it only tracks a byte offset into it.
pub struct Tokenizer<'a> {
    // The complete input buffer being tokenized.
    data: &'a [u8],
    // Offset into `data` of the next byte to be examined.
    pos: usize,
}
38
39impl<'a> Tokenizer<'a> {
40    pub fn new(data: &'a [u8]) -> Self {
41        Self { data, pos: 0 }
42    }
43
44    /// Create a tokenizer starting at a specific offset.
45    pub fn new_at(data: &'a [u8], pos: usize) -> Self {
46        Self { data, pos }
47    }
48
49    /// Current byte position in the data.
50    pub fn pos(&self) -> usize {
51        self.pos
52    }
53
54    /// Set position directly.
55    pub fn set_pos(&mut self, pos: usize) {
56        self.pos = pos;
57    }
58
59    /// Peek at the current byte without consuming it.
60    pub fn peek_byte(&self) -> Option<u8> {
61        self.data.get(self.pos).copied()
62    }
63
64    /// Check if we've reached the end of data.
65    pub fn is_eof(&self) -> bool {
66        self.pos >= self.data.len()
67    }
68
69    /// Get a slice of the underlying data.
70    pub fn data(&self) -> &'a [u8] {
71        self.data
72    }
73
74    /// Read the next token, skipping whitespace and comments.
75    pub fn next_token(&mut self) -> Result<Option<Token>> {
76        self.skip_whitespace_and_comments();
77
78        if self.is_eof() {
79            return Ok(None);
80        }
81
82        let byte = self.data[self.pos];
83
84        match byte {
85            b'(' => self.read_literal_string().map(Some),
86            b'<' => {
87                if self.pos + 1 < self.data.len() && self.data[self.pos + 1] == b'<' {
88                    self.pos += 2;
89                    Ok(Some(Token::DictBegin))
90                } else {
91                    self.read_hex_string().map(Some)
92                }
93            }
94            b'>' => {
95                if self.pos + 1 < self.data.len() && self.data[self.pos + 1] == b'>' {
96                    self.pos += 2;
97                    Ok(Some(Token::DictEnd))
98                } else {
99                    self.pos += 1;
100                    Err(FolioError::Parse {
101                        offset: self.pos as u64 - 1,
102                        message: "Unexpected '>'".into(),
103                    })
104                }
105            }
106            b'[' => {
107                self.pos += 1;
108                Ok(Some(Token::ArrayBegin))
109            }
110            b']' => {
111                self.pos += 1;
112                Ok(Some(Token::ArrayEnd))
113            }
114            b'/' => self.read_name().map(Some),
115            b'+' | b'-' | b'.' | b'0'..=b'9' => self.read_number().map(Some),
116            _ => self.read_keyword().map(Some),
117        }
118    }
119
120    /// Skip whitespace and comments.
121    pub fn skip_whitespace_and_comments(&mut self) {
122        while self.pos < self.data.len() {
123            let byte = self.data[self.pos];
124            if is_whitespace(byte) {
125                self.pos += 1;
126            } else if byte == b'%' {
127                // Skip comment until end of line
128                self.pos += 1;
129                while self.pos < self.data.len()
130                    && self.data[self.pos] != b'\n'
131                    && self.data[self.pos] != b'\r'
132                {
133                    self.pos += 1;
134                }
135            } else {
136                break;
137            }
138        }
139    }
140
141    /// Skip whitespace only (no comments).
142    pub fn skip_whitespace(&mut self) {
143        while self.pos < self.data.len() && is_whitespace(self.data[self.pos]) {
144            self.pos += 1;
145        }
146    }
147
148    /// Read a literal string (between parentheses with nesting support).
149    fn read_literal_string(&mut self) -> Result<Token> {
150        debug_assert_eq!(self.data[self.pos], b'(');
151        self.pos += 1; // skip opening (
152
153        let mut result = Vec::new();
154        let mut depth = 1u32;
155
156        while self.pos < self.data.len() {
157            let byte = self.data[self.pos];
158            self.pos += 1;
159
160            match byte {
161                b'(' => {
162                    depth += 1;
163                    result.push(b'(');
164                }
165                b')' => {
166                    depth -= 1;
167                    if depth == 0 {
168                        return Ok(Token::LiteralString(result));
169                    }
170                    result.push(b')');
171                }
172                b'\\' => {
173                    if self.pos >= self.data.len() {
174                        result.push(b'\\');
175                        break;
176                    }
177                    let escaped = self.data[self.pos];
178                    self.pos += 1;
179                    match escaped {
180                        b'n' => result.push(b'\n'),
181                        b'r' => result.push(b'\r'),
182                        b't' => result.push(b'\t'),
183                        b'b' => result.push(0x08),
184                        b'f' => result.push(0x0C),
185                        b'(' => result.push(b'('),
186                        b')' => result.push(b')'),
187                        b'\\' => result.push(b'\\'),
188                        b'\r' => {
189                            // Line continuation: \<CR> or \<CR><LF>
190                            if self.pos < self.data.len() && self.data[self.pos] == b'\n' {
191                                self.pos += 1;
192                            }
193                        }
194                        b'\n' => {
195                            // Line continuation: \<LF>
196                        }
197                        b'0'..=b'7' => {
198                            // Octal character code (1-3 digits)
199                            let mut octal = (escaped - b'0') as u32;
200                            for _ in 0..2 {
201                                if self.pos < self.data.len()
202                                    && self.data[self.pos] >= b'0'
203                                    && self.data[self.pos] <= b'7'
204                                {
205                                    octal = octal * 8 + (self.data[self.pos] - b'0') as u32;
206                                    self.pos += 1;
207                                } else {
208                                    break;
209                                }
210                            }
211                            result.push((octal & 0xFF) as u8);
212                        }
213                        _ => {
214                            // Unknown escape — ignore the backslash per spec
215                            result.push(escaped);
216                        }
217                    }
218                }
219                _ => result.push(byte),
220            }
221        }
222
223        Err(FolioError::Parse {
224            offset: self.pos as u64,
225            message: "Unterminated literal string".into(),
226        })
227    }
228
229    /// Read a hexadecimal string (between < and >).
230    fn read_hex_string(&mut self) -> Result<Token> {
231        debug_assert_eq!(self.data[self.pos], b'<');
232        self.pos += 1; // skip opening <
233
234        let mut hex_bytes = Vec::new();
235
236        while self.pos < self.data.len() {
237            let byte = self.data[self.pos];
238            self.pos += 1;
239
240            match byte {
241                b'>' => {
242                    // Decode hex pairs
243                    let mut result = Vec::with_capacity(hex_bytes.len() / 2);
244                    let mut i = 0;
245                    while i < hex_bytes.len() {
246                        let high = hex_bytes[i];
247                        let low = if i + 1 < hex_bytes.len() {
248                            hex_bytes[i + 1]
249                        } else {
250                            0 // Odd number of digits — pad with 0
251                        };
252                        result.push((high << 4) | low);
253                        i += 2;
254                    }
255                    return Ok(Token::HexString(result));
256                }
257                b' ' | b'\t' | b'\n' | b'\r' | b'\x0c' | b'\x00' => continue,
258                b'0'..=b'9' => hex_bytes.push(byte - b'0'),
259                b'a'..=b'f' => hex_bytes.push(byte - b'a' + 10),
260                b'A'..=b'F' => hex_bytes.push(byte - b'A' + 10),
261                _ => {
262                    return Err(FolioError::Parse {
263                        offset: self.pos as u64 - 1,
264                        message: format!("Invalid hex digit: 0x{:02x}", byte),
265                    });
266                }
267            }
268        }
269
270        Err(FolioError::Parse {
271            offset: self.pos as u64,
272            message: "Unterminated hex string".into(),
273        })
274    }
275
276    /// Read a name object (starts with /).
277    fn read_name(&mut self) -> Result<Token> {
278        debug_assert_eq!(self.data[self.pos], b'/');
279        self.pos += 1; // skip /
280
281        let mut name = Vec::new();
282
283        while self.pos < self.data.len() {
284            let byte = self.data[self.pos];
285
286            if is_whitespace(byte) || is_delimiter(byte) {
287                break;
288            }
289
290            self.pos += 1;
291
292            if byte == b'#' && self.pos + 1 < self.data.len() {
293                // Hex-encoded character: #XX
294                let h1 = hex_val(self.data[self.pos]);
295                let h2 = hex_val(self.data[self.pos + 1]);
296                if let (Some(high), Some(low)) = (h1, h2) {
297                    name.push((high << 4) | low);
298                    self.pos += 2;
299                } else {
300                    name.push(b'#');
301                }
302            } else {
303                name.push(byte);
304            }
305        }
306
307        Ok(Token::Name(name))
308    }
309
310    /// Read a number (integer or real).
311    fn read_number(&mut self) -> Result<Token> {
312        let start = self.pos;
313        let mut has_dot = false;
314
315        // Optional sign
316        if self.pos < self.data.len()
317            && (self.data[self.pos] == b'+' || self.data[self.pos] == b'-')
318        {
319            self.pos += 1;
320        }
321
322        // Digits and optional dot
323        while self.pos < self.data.len() {
324            let byte = self.data[self.pos];
325            match byte {
326                b'0'..=b'9' => self.pos += 1,
327                b'.' if !has_dot => {
328                    has_dot = true;
329                    self.pos += 1;
330                }
331                _ => break,
332            }
333        }
334
335        let num_str =
336            std::str::from_utf8(&self.data[start..self.pos]).map_err(|_| FolioError::Parse {
337                offset: start as u64,
338                message: "Invalid number encoding".into(),
339            })?;
340
341        if has_dot {
342            let val: f64 = num_str.parse().map_err(|_| FolioError::Parse {
343                offset: start as u64,
344                message: format!("Invalid real number: '{}'", num_str),
345            })?;
346            Ok(Token::Real(val))
347        } else {
348            // Try integer first, fall back to real for very large numbers
349            match num_str.parse::<i64>() {
350                Ok(val) => Ok(Token::Integer(val)),
351                Err(_) => {
352                    let val: f64 = num_str.parse().map_err(|_| FolioError::Parse {
353                        offset: start as u64,
354                        message: format!("Invalid number: '{}'", num_str),
355                    })?;
356                    Ok(Token::Real(val))
357                }
358            }
359        }
360    }
361
362    /// Read a keyword (alphabetic sequence).
363    fn read_keyword(&mut self) -> Result<Token> {
364        let start = self.pos;
365        while self.pos < self.data.len() {
366            let byte = self.data[self.pos];
367            if is_whitespace(byte) || is_delimiter(byte) {
368                break;
369            }
370            self.pos += 1;
371        }
372
373        if self.pos == start {
374            return Err(FolioError::Parse {
375                offset: start as u64,
376                message: format!(
377                    "Unexpected byte: 0x{:02x}",
378                    self.data.get(start).copied().unwrap_or(0)
379                ),
380            });
381        }
382
383        Ok(Token::Keyword(self.data[start..self.pos].to_vec()))
384    }
385}
386
/// Returns `true` for the six PDF white-space bytes: NUL, horizontal tab,
/// line feed, form feed, carriage return and space.
pub fn is_whitespace(byte: u8) -> bool {
    const WHITESPACE: [u8; 6] = [0x00, b'\t', b'\n', 0x0C, b'\r', b' '];
    WHITESPACE.contains(&byte)
}
391
/// Returns `true` for the PDF delimiter bytes
/// `(`, `)`, `<`, `>`, `[`, `]`, `{`, `}`, `/` and `%`.
pub fn is_delimiter(byte: u8) -> bool {
    b"()<>[]{}/%".contains(&byte)
}
399
/// Maps an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`) to its value `0..=15`.
///
/// Returns `None` for every other byte.
fn hex_val(byte: u8) -> Option<u8> {
    // `to_digit(16)` yields at most 15, so narrowing to `u8` is lossless.
    char::from(byte).to_digit(16).map(|digit| digit as u8)
}
409
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `input` to the end and returns every token.
    ///
    /// Panics on a tokenizer error instead of silently returning the tokens read
    /// so far, so an unexpected failure cannot go unnoticed in a test.
    fn tokenize(input: &[u8]) -> Vec<Token> {
        let mut tokenizer = Tokenizer::new(input);
        let mut tokens = Vec::new();
        loop {
            match tokenizer.next_token() {
                Ok(Some(token)) => tokens.push(token),
                Ok(None) => return tokens,
                Err(_) => panic!(
                    "tokenizer error before offset {} in {:?}",
                    tokenizer.pos(),
                    input
                ),
            }
        }
    }

    #[test]
    fn test_integer() {
        assert_eq!(tokenize(b"42"), vec![Token::Integer(42)]);
        assert_eq!(tokenize(b"-17"), vec![Token::Integer(-17)]);
        assert_eq!(tokenize(b"+5"), vec![Token::Integer(5)]);
        assert_eq!(tokenize(b"0"), vec![Token::Integer(0)]);
    }

    #[test]
    fn test_integer_overflow_falls_back_to_real() {
        assert_eq!(
            tokenize(b"99999999999999999999"),
            vec![Token::Real(1e20)]
        );
    }

    #[test]
    fn test_real() {
        // 3.25 rather than 3.14: the latter trips clippy's `approx_constant` lint.
        assert_eq!(tokenize(b"3.25"), vec![Token::Real(3.25)]);
        assert_eq!(tokenize(b"-0.5"), vec![Token::Real(-0.5)]);
        assert_eq!(tokenize(b".25"), vec![Token::Real(0.25)]);
    }

    #[test]
    fn test_name() {
        assert_eq!(tokenize(b"/Type"), vec![Token::Name(b"Type".to_vec())]);
        assert_eq!(tokenize(b"/A#42"), vec![Token::Name(b"AB".to_vec())]);
    }

    #[test]
    fn test_literal_string() {
        assert_eq!(
            tokenize(b"(Hello)"),
            vec![Token::LiteralString(b"Hello".to_vec())]
        );
        assert_eq!(
            tokenize(b"(Hello\\nWorld)"),
            vec![Token::LiteralString(b"Hello\nWorld".to_vec())]
        );
        // Nested parens
        assert_eq!(
            tokenize(b"(Hello (World))"),
            vec![Token::LiteralString(b"Hello (World)".to_vec())]
        );
        // Backslash + end-of-line is a line continuation and adds nothing.
        assert_eq!(
            tokenize(b"(a\\\nb)"),
            vec![Token::LiteralString(b"ab".to_vec())]
        );
    }

    #[test]
    fn test_hex_string() {
        assert_eq!(
            tokenize(b"<48656C6C6F>"),
            vec![Token::HexString(b"Hello".to_vec())]
        );
        assert_eq!(
            tokenize(b"<48 65 6C>"),
            vec![Token::HexString(b"Hel".to_vec())]
        );
        // Odd digit count: the missing final digit is treated as 0.
        assert_eq!(
            tokenize(b"<901FA>"),
            vec![Token::HexString(vec![0x90, 0x1F, 0xA0])]
        );
    }

    #[test]
    fn test_keywords() {
        assert_eq!(
            tokenize(b"true false null"),
            vec![
                Token::Keyword(b"true".to_vec()),
                Token::Keyword(b"false".to_vec()),
                Token::Keyword(b"null".to_vec()),
            ]
        );
    }

    #[test]
    fn test_delimiters() {
        assert_eq!(
            tokenize(b"[1 2]"),
            vec![
                Token::ArrayBegin,
                Token::Integer(1),
                Token::Integer(2),
                Token::ArrayEnd,
            ]
        );
        assert_eq!(
            tokenize(b"<< /Key /Value >>"),
            vec![
                Token::DictBegin,
                Token::Name(b"Key".to_vec()),
                Token::Name(b"Value".to_vec()),
                Token::DictEnd,
            ]
        );
    }

    #[test]
    fn test_comments() {
        assert_eq!(
            tokenize(b"42 % this is a comment\n17"),
            vec![Token::Integer(42), Token::Integer(17)]
        );
    }

    #[test]
    fn test_mixed() {
        assert_eq!(
            tokenize(b"/Type /Page /MediaBox [0 0 612 792]"),
            vec![
                Token::Name(b"Type".to_vec()),
                Token::Name(b"Page".to_vec()),
                Token::Name(b"MediaBox".to_vec()),
                Token::ArrayBegin,
                Token::Integer(0),
                Token::Integer(0),
                Token::Integer(612),
                Token::Integer(792),
                Token::ArrayEnd,
            ]
        );
    }

    #[test]
    fn test_octal_escape() {
        assert_eq!(
            tokenize(b"(\\110\\145\\154\\154\\157)"),
            vec![Token::LiteralString(b"Hello".to_vec())]
        );
    }

    #[test]
    fn test_errors_are_reported() {
        assert!(Tokenizer::new(b"(unterminated").next_token().is_err());
        assert!(Tokenizer::new(b"<48").next_token().is_err());
        assert!(Tokenizer::new(b"<4G>").next_token().is_err());
        assert!(Tokenizer::new(b">").next_token().is_err());
    }
}
529            tokenize(b"(\\110\\145\\154\\154\\157)"),
530            vec![Token::LiteralString(b"Hello".to_vec())]
531        );
532    }
533}