Skip to main content

justpdf_core/tokenizer/
mod.rs

1pub mod reader;
2pub mod token;
3
4use crate::error::{JustPdfError, Result};
5use reader::{PdfReader, is_pdf_delimiter, is_pdf_regular, is_pdf_whitespace};
6use token::{Keyword, Token};
7
8/// PDF tokenizer: consumes bytes from a `PdfReader` and yields `Token` values.
9pub struct Tokenizer<'a> {
10    reader: PdfReader<'a>,
11}
12
13impl<'a> Tokenizer<'a> {
14    pub fn new(data: &'a [u8]) -> Self {
15        Self {
16            reader: PdfReader::new(data),
17        }
18    }
19
20    pub fn new_at(data: &'a [u8], pos: usize) -> Self {
21        Self {
22            reader: PdfReader::new_at(data, pos),
23        }
24    }
25
26    /// Current byte offset.
27    pub fn pos(&self) -> usize {
28        self.reader.pos()
29    }
30
31    /// Set position.
32    pub fn seek(&mut self, pos: usize) {
33        self.reader.seek(pos);
34    }
35
36    pub fn is_eof(&self) -> bool {
37        self.reader.is_eof()
38    }
39
40    /// Access the underlying reader.
41    pub fn reader(&self) -> &PdfReader<'a> {
42        &self.reader
43    }
44
45    /// Read the next token, skipping whitespace and comments.
46    /// Returns `None` at EOF.
47    pub fn next_token(&mut self) -> Result<Option<Token>> {
48        self.reader.skip_whitespace_and_comments();
49        if self.reader.is_eof() {
50            return Ok(None);
51        }
52
53        let offset = self.reader.pos();
54        let b = self.reader.peek().unwrap();
55
56        match b {
57            // Literal string
58            b'(' => self.read_literal_string(),
59            // Hex string or dict delimiter
60            b'<' => {
61                if self.reader.peek_at(1) == Some(b'<') {
62                    self.reader.advance(2);
63                    Ok(Some(Token::DictBegin))
64                } else {
65                    self.read_hex_string()
66                }
67            }
68            // Dict end or unexpected >
69            b'>' => {
70                if self.reader.peek_at(1) == Some(b'>') {
71                    self.reader.advance(2);
72                    Ok(Some(Token::DictEnd))
73                } else {
74                    self.reader.advance(1);
75                    Err(JustPdfError::InvalidToken {
76                        offset,
77                        detail: "unexpected '>'".into(),
78                    })
79                }
80            }
81            b'[' => {
82                self.reader.advance(1);
83                Ok(Some(Token::ArrayBegin))
84            }
85            b']' => {
86                self.reader.advance(1);
87                Ok(Some(Token::ArrayEnd))
88            }
89            // Name
90            b'/' => self.read_name(),
91            // Number or keyword starting with +/-
92            b'+' | b'-' => self.read_number_or_keyword(),
93            b'0'..=b'9' | b'.' => self.read_number_or_keyword(),
94            // Regular character → keyword or unknown
95            _ if is_pdf_regular(b) => self.read_keyword(),
96            _ => {
97                self.reader.advance(1);
98                Err(JustPdfError::InvalidToken {
99                    offset,
100                    detail: format!("unexpected byte 0x{b:02X}"),
101                })
102            }
103        }
104    }
105
106    /// Read a literal string `(...)` with escape handling and balanced parentheses.
107    fn read_literal_string(&mut self) -> Result<Option<Token>> {
108        let start = self.reader.pos();
109        self.reader.advance(1); // skip '('
110        let mut result = Vec::new();
111        let mut depth: u32 = 1;
112
113        loop {
114            let Some(b) = self.reader.next_byte() else {
115                return Err(JustPdfError::UnexpectedEof { offset: start });
116            };
117            match b {
118                b'(' => {
119                    depth += 1;
120                    result.push(b'(');
121                }
122                b')' => {
123                    depth -= 1;
124                    if depth == 0 {
125                        break;
126                    }
127                    result.push(b')');
128                }
129                b'\\' => {
130                    let Some(esc) = self.reader.next_byte() else {
131                        return Err(JustPdfError::UnexpectedEof { offset: start });
132                    };
133                    match esc {
134                        b'n' => result.push(b'\n'),
135                        b'r' => result.push(b'\r'),
136                        b't' => result.push(b'\t'),
137                        b'b' => result.push(0x08),
138                        b'f' => result.push(0x0C),
139                        b'(' => result.push(b'('),
140                        b')' => result.push(b')'),
141                        b'\\' => result.push(b'\\'),
142                        b'\r' => {
143                            // Line continuation: \<CR> or \<CR><LF>
144                            if self.reader.peek() == Some(b'\n') {
145                                self.reader.advance(1);
146                            }
147                        }
148                        b'\n' => {
149                            // Line continuation: \<LF>
150                        }
151                        b'0'..=b'7' => {
152                            // Octal escape: 1-3 digits
153                            let mut val = esc - b'0';
154                            if let Some(d) = self.reader.peek()
155                                && (b'0'..=b'7').contains(&d)
156                            {
157                                self.reader.advance(1);
158                                val = val * 8 + (d - b'0');
159                                if let Some(d2) = self.reader.peek()
160                                    && (b'0'..=b'7').contains(&d2)
161                                {
162                                    self.reader.advance(1);
163                                    val = val * 8 + (d2 - b'0');
164                                }
165                            }
166                            result.push(val);
167                        }
168                        // Unknown escape: ignore the backslash
169                        _ => result.push(esc),
170                    }
171                }
172                // Normalize line endings to \n
173                b'\r' => {
174                    result.push(b'\n');
175                    if self.reader.peek() == Some(b'\n') {
176                        self.reader.advance(1);
177                    }
178                }
179                _ => result.push(b),
180            }
181        }
182
183        Ok(Some(Token::LiteralString(result)))
184    }
185
186    /// Read a hex string `<...>`.
187    fn read_hex_string(&mut self) -> Result<Option<Token>> {
188        let start = self.reader.pos();
189        self.reader.advance(1); // skip '<'
190        let mut hex_chars = Vec::new();
191
192        loop {
193            let Some(b) = self.reader.next_byte() else {
194                return Err(JustPdfError::UnexpectedEof { offset: start });
195            };
196            match b {
197                b'>' => break,
198                _ if is_pdf_whitespace(b) => continue,
199                _ if b.is_ascii_hexdigit() => hex_chars.push(b),
200                _ => {
201                    return Err(JustPdfError::InvalidToken {
202                        offset: self.reader.pos() - 1,
203                        detail: format!("invalid hex digit 0x{b:02X}"),
204                    });
205                }
206            }
207        }
208
209        // Pad with trailing 0 if odd number of hex chars
210        if hex_chars.len() % 2 != 0 {
211            hex_chars.push(b'0');
212        }
213
214        let mut result = Vec::with_capacity(hex_chars.len() / 2);
215        for pair in hex_chars.chunks(2) {
216            let hi = hex_val(pair[0]);
217            let lo = hex_val(pair[1]);
218            result.push((hi << 4) | lo);
219        }
220
221        Ok(Some(Token::HexString(result)))
222    }
223
224    /// Read a name `/...`.
225    fn read_name(&mut self) -> Result<Option<Token>> {
226        self.reader.advance(1); // skip '/'
227        let mut name = Vec::new();
228
229        while let Some(b) = self.reader.peek() {
230            if is_pdf_whitespace(b) || is_pdf_delimiter(b) {
231                break;
232            }
233            self.reader.advance(1);
234            if b == b'#' {
235                // #XX hex escape in name
236                let h1 = self.reader.next_byte();
237                let h2 = self.reader.next_byte();
238                match (h1, h2) {
239                    (Some(a), Some(b)) if a.is_ascii_hexdigit() && b.is_ascii_hexdigit() => {
240                        name.push((hex_val(a) << 4) | hex_val(b));
241                    }
242                    _ => {
243                        return Err(JustPdfError::InvalidToken {
244                            offset: self.reader.pos() - 2,
245                            detail: "invalid hex escape in name".into(),
246                        });
247                    }
248                }
249            } else {
250                name.push(b);
251            }
252        }
253
254        Ok(Some(Token::Name(name)))
255    }
256
257    /// Read a number (integer or real) or a keyword starting with +/-.
258    fn read_number_or_keyword(&mut self) -> Result<Option<Token>> {
259        let start = self.reader.pos();
260        let mut buf = Vec::new();
261        let mut has_dot = false;
262
263        while let Some(b) = self.reader.peek() {
264            match b {
265                b'0'..=b'9' | b'+' | b'-' => {
266                    buf.push(b);
267                    self.reader.advance(1);
268                }
269                b'.' => {
270                    has_dot = true;
271                    buf.push(b);
272                    self.reader.advance(1);
273                }
274                _ if is_pdf_whitespace(b) || is_pdf_delimiter(b) => break,
275                _ if is_pdf_regular(b) => {
276                    // Not a number, fall back to keyword reading
277                    buf.push(b);
278                    self.reader.advance(1);
279                    while let Some(b) = self.reader.peek() {
280                        if !is_pdf_regular(b) {
281                            break;
282                        }
283                        buf.push(b);
284                        self.reader.advance(1);
285                    }
286                    return self.classify_keyword(&buf, start);
287                }
288                _ => break,
289            }
290        }
291
292        if has_dot {
293            let s = std::str::from_utf8(&buf).unwrap_or("?");
294            match s.parse::<f64>() {
295                Ok(v) => Ok(Some(Token::Real(v))),
296                Err(_) => Err(JustPdfError::InvalidToken {
297                    offset: start,
298                    detail: format!("invalid real number: {s}"),
299                }),
300            }
301        } else {
302            let s = std::str::from_utf8(&buf).unwrap_or("?");
303            match s.parse::<i64>() {
304                Ok(v) => Ok(Some(Token::Integer(v))),
305                Err(_) => Err(JustPdfError::InvalidToken {
306                    offset: start,
307                    detail: format!("invalid integer: {s}"),
308                }),
309            }
310        }
311    }
312
313    /// Read a keyword (sequence of regular characters).
314    fn read_keyword(&mut self) -> Result<Option<Token>> {
315        let start = self.reader.pos();
316        let mut buf = Vec::new();
317
318        while let Some(b) = self.reader.peek() {
319            if !is_pdf_regular(b) {
320                break;
321            }
322            buf.push(b);
323            self.reader.advance(1);
324        }
325
326        self.classify_keyword(&buf, start)
327    }
328
329    fn classify_keyword(&self, buf: &[u8], offset: usize) -> Result<Option<Token>> {
330        if let Some(kw) = Keyword::from_bytes(buf) {
331            Ok(Some(Token::Keyword(kw)))
332        } else {
333            Err(JustPdfError::InvalidToken {
334                offset,
335                detail: format!(
336                    "unknown keyword: {}",
337                    std::str::from_utf8(buf).unwrap_or("<non-utf8>")
338                ),
339            })
340        }
341    }
342}
343
/// Numeric value (0..=15) of an ASCII hexadecimal digit, accepting either letter case.
///
/// Any byte that is not a hex digit maps to 0, so callers that care must validate the
/// byte themselves before calling.
#[inline]
fn hex_val(b: u8) -> u8 {
    // `to_digit(16)` yields at most 15, so narrowing back to `u8` cannot truncate.
    char::from(b).to_digit(16).map_or(0, |digit| digit as u8)
}
353
#[cfg(test)]
mod tests {
    use super::*;

    /// Collects tokens from `input` until EOF or the first error.
    ///
    /// Errors are not surfaced: tokenizing simply stops, so these tests only ever observe
    /// the successfully read prefix.
    fn tokenize(input: &[u8]) -> Vec<Token> {
        let mut lexer = Tokenizer::new(input);
        std::iter::from_fn(|| lexer.next_token().ok().flatten()).collect()
    }

    /// Asserts that `input` tokenizes to exactly `expected`.
    #[track_caller]
    fn check(input: &[u8], expected: &[Token]) {
        assert_eq!(tokenize(input).as_slice(), expected);
    }

    #[test]
    fn test_integer() {
        check(b"42", &[Token::Integer(42)]);
        check(b"-17", &[Token::Integer(-17)]);
        check(b"+5", &[Token::Integer(5)]);
        check(b"0", &[Token::Integer(0)]);
    }

    #[test]
    fn test_real() {
        check(b"3.15", &[Token::Real(3.15)]);
        check(b"-0.5", &[Token::Real(-0.5)]);
        check(b".25", &[Token::Real(0.25)]);
    }

    #[test]
    fn test_literal_string() {
        check(b"(Hello)", &[Token::LiteralString(b"Hello".to_vec())]);
        check(
            b"(Hello\\nWorld)",
            &[Token::LiteralString(b"Hello\nWorld".to_vec())],
        );
        // Nested, balanced parentheses stay part of the string.
        check(b"(a(b)c)", &[Token::LiteralString(b"a(b)c".to_vec())]);
        // Three-digit octal escape.
        check(b"(\\101)", &[Token::LiteralString(b"A".to_vec())]);
    }

    #[test]
    fn test_hex_string() {
        check(b"<48656C6C6F>", &[Token::HexString(b"Hello".to_vec())]);
        // An odd digit count is padded with a trailing 0.
        check(b"<ABC>", &[Token::HexString(vec![0xAB, 0xC0])]);
        // Whitespace between digits is ignored.
        check(b"<48 65 6C 6C 6F>", &[Token::HexString(b"Hello".to_vec())]);
    }

    #[test]
    fn test_name() {
        check(b"/Type", &[Token::Name(b"Type".to_vec())]);
        check(b"/A#42C", &[Token::Name(b"ABC".to_vec())]);
        // A bare slash is an empty name.
        check(b"/ ", &[Token::Name(b"".to_vec())]);
    }

    #[test]
    fn test_keywords() {
        check(
            b"true false null",
            &[
                Token::Keyword(Keyword::True),
                Token::Keyword(Keyword::False),
                Token::Keyword(Keyword::Null),
            ],
        );
    }

    #[test]
    fn test_array_dict_delimiters() {
        check(b"[ ]", &[Token::ArrayBegin, Token::ArrayEnd]);
        check(b"<< >>", &[Token::DictBegin, Token::DictEnd]);
    }

    #[test]
    fn test_comment_skipping() {
        check(
            b"42 % this is a comment\n17",
            &[Token::Integer(42), Token::Integer(17)],
        );
    }

    #[test]
    fn test_mixed_tokens() {
        check(
            b"/Type /Catalog /Pages 2 0 R",
            &[
                Token::Name(b"Type".to_vec()),
                Token::Name(b"Catalog".to_vec()),
                Token::Name(b"Pages".to_vec()),
                Token::Integer(2),
                Token::Integer(0),
                Token::Keyword(Keyword::R),
            ],
        );
    }

    #[test]
    fn test_empty_input() {
        check(b"", &[]);
    }

    #[test]
    fn test_whitespace_only() {
        check(b"   \t\n\r  ", &[]);
    }
}