1pub mod reader;
2pub mod token;
3
4use crate::error::{JustPdfError, Result};
5use reader::{PdfReader, is_pdf_delimiter, is_pdf_regular, is_pdf_whitespace};
6use token::{Keyword, Token};
7
/// Lexer that splits raw PDF bytes into [`Token`]s (strings, names,
/// numbers, array/dict delimiters, and keywords).
///
/// Construct with [`Tokenizer::new`] or [`Tokenizer::new_at`], then pull
/// tokens with `next_token` until it yields `Ok(None)`.
pub struct Tokenizer<'a> {
    // Byte cursor over the input slice; owns position tracking and EOF state.
    reader: PdfReader<'a>,
}
12
13impl<'a> Tokenizer<'a> {
14 pub fn new(data: &'a [u8]) -> Self {
15 Self {
16 reader: PdfReader::new(data),
17 }
18 }
19
20 pub fn new_at(data: &'a [u8], pos: usize) -> Self {
21 Self {
22 reader: PdfReader::new_at(data, pos),
23 }
24 }
25
26 pub fn pos(&self) -> usize {
28 self.reader.pos()
29 }
30
31 pub fn seek(&mut self, pos: usize) {
33 self.reader.seek(pos);
34 }
35
36 pub fn is_eof(&self) -> bool {
37 self.reader.is_eof()
38 }
39
40 pub fn reader(&self) -> &PdfReader<'a> {
42 &self.reader
43 }
44
45 pub fn next_token(&mut self) -> Result<Option<Token>> {
48 self.reader.skip_whitespace_and_comments();
49 if self.reader.is_eof() {
50 return Ok(None);
51 }
52
53 let offset = self.reader.pos();
54 let b = self.reader.peek().unwrap();
55
56 match b {
57 b'(' => self.read_literal_string(),
59 b'<' => {
61 if self.reader.peek_at(1) == Some(b'<') {
62 self.reader.advance(2);
63 Ok(Some(Token::DictBegin))
64 } else {
65 self.read_hex_string()
66 }
67 }
68 b'>' => {
70 if self.reader.peek_at(1) == Some(b'>') {
71 self.reader.advance(2);
72 Ok(Some(Token::DictEnd))
73 } else {
74 self.reader.advance(1);
75 Err(JustPdfError::InvalidToken {
76 offset,
77 detail: "unexpected '>'".into(),
78 })
79 }
80 }
81 b'[' => {
82 self.reader.advance(1);
83 Ok(Some(Token::ArrayBegin))
84 }
85 b']' => {
86 self.reader.advance(1);
87 Ok(Some(Token::ArrayEnd))
88 }
89 b'/' => self.read_name(),
91 b'+' | b'-' => self.read_number_or_keyword(),
93 b'0'..=b'9' | b'.' => self.read_number_or_keyword(),
94 _ if is_pdf_regular(b) => self.read_keyword(),
96 _ => {
97 self.reader.advance(1);
98 Err(JustPdfError::InvalidToken {
99 offset,
100 detail: format!("unexpected byte 0x{b:02X}"),
101 })
102 }
103 }
104 }
105
106 fn read_literal_string(&mut self) -> Result<Option<Token>> {
108 let start = self.reader.pos();
109 self.reader.advance(1); let mut result = Vec::new();
111 let mut depth: u32 = 1;
112
113 loop {
114 let Some(b) = self.reader.next_byte() else {
115 return Err(JustPdfError::UnexpectedEof { offset: start });
116 };
117 match b {
118 b'(' => {
119 depth += 1;
120 result.push(b'(');
121 }
122 b')' => {
123 depth -= 1;
124 if depth == 0 {
125 break;
126 }
127 result.push(b')');
128 }
129 b'\\' => {
130 let Some(esc) = self.reader.next_byte() else {
131 return Err(JustPdfError::UnexpectedEof { offset: start });
132 };
133 match esc {
134 b'n' => result.push(b'\n'),
135 b'r' => result.push(b'\r'),
136 b't' => result.push(b'\t'),
137 b'b' => result.push(0x08),
138 b'f' => result.push(0x0C),
139 b'(' => result.push(b'('),
140 b')' => result.push(b')'),
141 b'\\' => result.push(b'\\'),
142 b'\r' => {
143 if self.reader.peek() == Some(b'\n') {
145 self.reader.advance(1);
146 }
147 }
148 b'\n' => {
149 }
151 b'0'..=b'7' => {
152 let mut val = esc - b'0';
154 if let Some(d) = self.reader.peek()
155 && (b'0'..=b'7').contains(&d)
156 {
157 self.reader.advance(1);
158 val = val * 8 + (d - b'0');
159 if let Some(d2) = self.reader.peek()
160 && (b'0'..=b'7').contains(&d2)
161 {
162 self.reader.advance(1);
163 val = val * 8 + (d2 - b'0');
164 }
165 }
166 result.push(val);
167 }
168 _ => result.push(esc),
170 }
171 }
172 b'\r' => {
174 result.push(b'\n');
175 if self.reader.peek() == Some(b'\n') {
176 self.reader.advance(1);
177 }
178 }
179 _ => result.push(b),
180 }
181 }
182
183 Ok(Some(Token::LiteralString(result)))
184 }
185
186 fn read_hex_string(&mut self) -> Result<Option<Token>> {
188 let start = self.reader.pos();
189 self.reader.advance(1); let mut hex_chars = Vec::new();
191
192 loop {
193 let Some(b) = self.reader.next_byte() else {
194 return Err(JustPdfError::UnexpectedEof { offset: start });
195 };
196 match b {
197 b'>' => break,
198 _ if is_pdf_whitespace(b) => continue,
199 _ if b.is_ascii_hexdigit() => hex_chars.push(b),
200 _ => {
201 return Err(JustPdfError::InvalidToken {
202 offset: self.reader.pos() - 1,
203 detail: format!("invalid hex digit 0x{b:02X}"),
204 });
205 }
206 }
207 }
208
209 if hex_chars.len() % 2 != 0 {
211 hex_chars.push(b'0');
212 }
213
214 let mut result = Vec::with_capacity(hex_chars.len() / 2);
215 for pair in hex_chars.chunks(2) {
216 let hi = hex_val(pair[0]);
217 let lo = hex_val(pair[1]);
218 result.push((hi << 4) | lo);
219 }
220
221 Ok(Some(Token::HexString(result)))
222 }
223
224 fn read_name(&mut self) -> Result<Option<Token>> {
226 self.reader.advance(1); let mut name = Vec::new();
228
229 while let Some(b) = self.reader.peek() {
230 if is_pdf_whitespace(b) || is_pdf_delimiter(b) {
231 break;
232 }
233 self.reader.advance(1);
234 if b == b'#' {
235 let h1 = self.reader.next_byte();
237 let h2 = self.reader.next_byte();
238 match (h1, h2) {
239 (Some(a), Some(b)) if a.is_ascii_hexdigit() && b.is_ascii_hexdigit() => {
240 name.push((hex_val(a) << 4) | hex_val(b));
241 }
242 _ => {
243 return Err(JustPdfError::InvalidToken {
244 offset: self.reader.pos() - 2,
245 detail: "invalid hex escape in name".into(),
246 });
247 }
248 }
249 } else {
250 name.push(b);
251 }
252 }
253
254 Ok(Some(Token::Name(name)))
255 }
256
257 fn read_number_or_keyword(&mut self) -> Result<Option<Token>> {
259 let start = self.reader.pos();
260 let mut buf = Vec::new();
261 let mut has_dot = false;
262
263 while let Some(b) = self.reader.peek() {
264 match b {
265 b'0'..=b'9' | b'+' | b'-' => {
266 buf.push(b);
267 self.reader.advance(1);
268 }
269 b'.' => {
270 has_dot = true;
271 buf.push(b);
272 self.reader.advance(1);
273 }
274 _ if is_pdf_whitespace(b) || is_pdf_delimiter(b) => break,
275 _ if is_pdf_regular(b) => {
276 buf.push(b);
278 self.reader.advance(1);
279 while let Some(b) = self.reader.peek() {
280 if !is_pdf_regular(b) {
281 break;
282 }
283 buf.push(b);
284 self.reader.advance(1);
285 }
286 return self.classify_keyword(&buf, start);
287 }
288 _ => break,
289 }
290 }
291
292 if has_dot {
293 let s = std::str::from_utf8(&buf).unwrap_or("?");
294 match s.parse::<f64>() {
295 Ok(v) => Ok(Some(Token::Real(v))),
296 Err(_) => Err(JustPdfError::InvalidToken {
297 offset: start,
298 detail: format!("invalid real number: {s}"),
299 }),
300 }
301 } else {
302 let s = std::str::from_utf8(&buf).unwrap_or("?");
303 match s.parse::<i64>() {
304 Ok(v) => Ok(Some(Token::Integer(v))),
305 Err(_) => Err(JustPdfError::InvalidToken {
306 offset: start,
307 detail: format!("invalid integer: {s}"),
308 }),
309 }
310 }
311 }
312
313 fn read_keyword(&mut self) -> Result<Option<Token>> {
315 let start = self.reader.pos();
316 let mut buf = Vec::new();
317
318 while let Some(b) = self.reader.peek() {
319 if !is_pdf_regular(b) {
320 break;
321 }
322 buf.push(b);
323 self.reader.advance(1);
324 }
325
326 self.classify_keyword(&buf, start)
327 }
328
329 fn classify_keyword(&self, buf: &[u8], offset: usize) -> Result<Option<Token>> {
330 if let Some(kw) = Keyword::from_bytes(buf) {
331 Ok(Some(Token::Keyword(kw)))
332 } else {
333 Err(JustPdfError::InvalidToken {
334 offset,
335 detail: format!(
336 "unknown keyword: {}",
337 std::str::from_utf8(buf).unwrap_or("<non-utf8>")
338 ),
339 })
340 }
341 }
342}
343
/// Decodes one ASCII hex digit (`0-9`, `a-f`, `A-F`) to its value 0-15.
///
/// Any non-hex byte maps to 0; callers validate with `is_ascii_hexdigit`
/// first, so that fallback is never reached on well-formed input.
#[inline]
fn hex_val(b: u8) -> u8 {
    (b as char).to_digit(16).map_or(0, |v| v as u8)
}
353
#[cfg(test)]
mod tests {
    use super::*;

    /// Collects tokens until the first error or end of input.
    fn tokenize(input: &[u8]) -> Vec<Token> {
        let mut lexer = Tokenizer::new(input);
        std::iter::from_fn(move || lexer.next_token().ok().flatten()).collect()
    }

    #[test]
    fn test_integer() {
        let cases: [(&[u8], i64); 4] = [(b"42", 42), (b"-17", -17), (b"+5", 5), (b"0", 0)];
        for (input, expected) in cases {
            assert_eq!(tokenize(input), [Token::Integer(expected)]);
        }
    }

    #[test]
    fn test_real() {
        let cases: [(&[u8], f64); 3] = [(b"3.15", 3.15), (b"-0.5", -0.5), (b".25", 0.25)];
        for (input, expected) in cases {
            assert_eq!(tokenize(input), [Token::Real(expected)]);
        }
    }

    #[test]
    fn test_literal_string() {
        let cases: [(&[u8], &[u8]); 4] = [
            (b"(Hello)", b"Hello"),
            (b"(Hello\\nWorld)", b"Hello\nWorld"),
            (b"(a(b)c)", b"a(b)c"),
            (b"(\\101)", b"A"),
        ];
        for (input, expected) in cases {
            assert_eq!(tokenize(input), [Token::LiteralString(expected.to_vec())]);
        }
    }

    #[test]
    fn test_hex_string() {
        assert_eq!(
            tokenize(b"<48656C6C6F>"),
            [Token::HexString(b"Hello".to_vec())]
        );
        // Odd digit count: the final digit becomes the high nibble.
        assert_eq!(tokenize(b"<ABC>"), [Token::HexString(vec![0xAB, 0xC0])]);
        // Whitespace between digits is ignored.
        assert_eq!(
            tokenize(b"<48 65 6C 6C 6F>"),
            [Token::HexString(b"Hello".to_vec())]
        );
    }

    #[test]
    fn test_name() {
        assert_eq!(tokenize(b"/Type"), [Token::Name(b"Type".to_vec())]);
        // "#42" decodes to 'B'.
        assert_eq!(tokenize(b"/A#42C"), [Token::Name(b"ABC".to_vec())]);
        // A bare '/' is the empty name.
        assert_eq!(tokenize(b"/ "), [Token::Name(b"".to_vec())]);
    }

    #[test]
    fn test_keywords() {
        let expected = [
            Token::Keyword(Keyword::True),
            Token::Keyword(Keyword::False),
            Token::Keyword(Keyword::Null),
        ];
        assert_eq!(tokenize(b"true false null"), expected);
    }

    #[test]
    fn test_array_dict_delimiters() {
        assert_eq!(tokenize(b"[ ]"), [Token::ArrayBegin, Token::ArrayEnd]);
        assert_eq!(tokenize(b"<< >>"), [Token::DictBegin, Token::DictEnd]);
    }

    #[test]
    fn test_comment_skipping() {
        let tokens = tokenize(b"42 % this is a comment\n17");
        assert_eq!(tokens, [Token::Integer(42), Token::Integer(17)]);
    }

    #[test]
    fn test_mixed_tokens() {
        let expected = [
            Token::Name(b"Type".to_vec()),
            Token::Name(b"Catalog".to_vec()),
            Token::Name(b"Pages".to_vec()),
            Token::Integer(2),
            Token::Integer(0),
            Token::Keyword(Keyword::R),
        ];
        assert_eq!(tokenize(b"/Type /Catalog /Pages 2 0 R"), expected);
    }

    #[test]
    fn test_empty_input() {
        assert!(tokenize(b"").is_empty());
    }

    #[test]
    fn test_whitespace_only() {
        assert!(tokenize(b" \t\n\r ").is_empty());
    }
}
479}