1use folio_core::{FolioError, Result};
7
/// A single lexical token produced by [`Tokenizer`].
///
/// String-like variants carry raw bytes rather than `String`, since the
/// input is binary data and strings need not be valid UTF-8.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// Whole number, e.g. `42`, `-17`, `+5`.
    Integer(i64),
    /// Real (floating-point) number, e.g. `3.14`, `.25`; also produced
    /// when an integer literal overflows `i64`.
    Real(f64),
    /// `(...)` literal string with escapes and nesting resolved.
    LiteralString(Vec<u8>),
    /// `<...>` hex string, decoded to raw bytes.
    HexString(Vec<u8>),
    /// `/Name` token with `#xx` escapes decoded; the leading `/` is not included.
    Name(Vec<u8>),
    /// Bare keyword such as `true`, `false`, `null`, `obj`.
    Keyword(Vec<u8>),
    /// `[`
    ArrayBegin,
    /// `]`
    ArrayEnd,
    /// `<<`
    DictBegin,
    /// `>>`
    DictEnd,
}
32
/// Streaming tokenizer over a borrowed byte buffer with an explicit,
/// externally settable cursor (see `pos`/`set_pos`).
pub struct Tokenizer<'a> {
    // Entire input buffer being tokenized (not just the unread tail).
    data: &'a [u8],
    // Byte offset of the next unread byte.
    pos: usize,
}
38
39impl<'a> Tokenizer<'a> {
40 pub fn new(data: &'a [u8]) -> Self {
41 Self { data, pos: 0 }
42 }
43
44 pub fn new_at(data: &'a [u8], pos: usize) -> Self {
46 Self { data, pos }
47 }
48
49 pub fn pos(&self) -> usize {
51 self.pos
52 }
53
54 pub fn set_pos(&mut self, pos: usize) {
56 self.pos = pos;
57 }
58
59 pub fn peek_byte(&self) -> Option<u8> {
61 self.data.get(self.pos).copied()
62 }
63
64 pub fn is_eof(&self) -> bool {
66 self.pos >= self.data.len()
67 }
68
69 pub fn data(&self) -> &'a [u8] {
71 self.data
72 }
73
74 pub fn next_token(&mut self) -> Result<Option<Token>> {
76 self.skip_whitespace_and_comments();
77
78 if self.is_eof() {
79 return Ok(None);
80 }
81
82 let byte = self.data[self.pos];
83
84 match byte {
85 b'(' => self.read_literal_string().map(Some),
86 b'<' => {
87 if self.pos + 1 < self.data.len() && self.data[self.pos + 1] == b'<' {
88 self.pos += 2;
89 Ok(Some(Token::DictBegin))
90 } else {
91 self.read_hex_string().map(Some)
92 }
93 }
94 b'>' => {
95 if self.pos + 1 < self.data.len() && self.data[self.pos + 1] == b'>' {
96 self.pos += 2;
97 Ok(Some(Token::DictEnd))
98 } else {
99 self.pos += 1;
100 Err(FolioError::Parse {
101 offset: self.pos as u64 - 1,
102 message: "Unexpected '>'".into(),
103 })
104 }
105 }
106 b'[' => {
107 self.pos += 1;
108 Ok(Some(Token::ArrayBegin))
109 }
110 b']' => {
111 self.pos += 1;
112 Ok(Some(Token::ArrayEnd))
113 }
114 b'/' => self.read_name().map(Some),
115 b'+' | b'-' | b'.' | b'0'..=b'9' => self.read_number().map(Some),
116 _ => self.read_keyword().map(Some),
117 }
118 }
119
120 pub fn skip_whitespace_and_comments(&mut self) {
122 while self.pos < self.data.len() {
123 let byte = self.data[self.pos];
124 if is_whitespace(byte) {
125 self.pos += 1;
126 } else if byte == b'%' {
127 self.pos += 1;
129 while self.pos < self.data.len()
130 && self.data[self.pos] != b'\n'
131 && self.data[self.pos] != b'\r'
132 {
133 self.pos += 1;
134 }
135 } else {
136 break;
137 }
138 }
139 }
140
141 pub fn skip_whitespace(&mut self) {
143 while self.pos < self.data.len() && is_whitespace(self.data[self.pos]) {
144 self.pos += 1;
145 }
146 }
147
148 fn read_literal_string(&mut self) -> Result<Token> {
150 debug_assert_eq!(self.data[self.pos], b'(');
151 self.pos += 1; let mut result = Vec::new();
154 let mut depth = 1u32;
155
156 while self.pos < self.data.len() {
157 let byte = self.data[self.pos];
158 self.pos += 1;
159
160 match byte {
161 b'(' => {
162 depth += 1;
163 result.push(b'(');
164 }
165 b')' => {
166 depth -= 1;
167 if depth == 0 {
168 return Ok(Token::LiteralString(result));
169 }
170 result.push(b')');
171 }
172 b'\\' => {
173 if self.pos >= self.data.len() {
174 result.push(b'\\');
175 break;
176 }
177 let escaped = self.data[self.pos];
178 self.pos += 1;
179 match escaped {
180 b'n' => result.push(b'\n'),
181 b'r' => result.push(b'\r'),
182 b't' => result.push(b'\t'),
183 b'b' => result.push(0x08),
184 b'f' => result.push(0x0C),
185 b'(' => result.push(b'('),
186 b')' => result.push(b')'),
187 b'\\' => result.push(b'\\'),
188 b'\r' => {
189 if self.pos < self.data.len() && self.data[self.pos] == b'\n' {
191 self.pos += 1;
192 }
193 }
194 b'\n' => {
195 }
197 b'0'..=b'7' => {
198 let mut octal = (escaped - b'0') as u32;
200 for _ in 0..2 {
201 if self.pos < self.data.len()
202 && self.data[self.pos] >= b'0'
203 && self.data[self.pos] <= b'7'
204 {
205 octal = octal * 8 + (self.data[self.pos] - b'0') as u32;
206 self.pos += 1;
207 } else {
208 break;
209 }
210 }
211 result.push((octal & 0xFF) as u8);
212 }
213 _ => {
214 result.push(escaped);
216 }
217 }
218 }
219 _ => result.push(byte),
220 }
221 }
222
223 Err(FolioError::Parse {
224 offset: self.pos as u64,
225 message: "Unterminated literal string".into(),
226 })
227 }
228
229 fn read_hex_string(&mut self) -> Result<Token> {
231 debug_assert_eq!(self.data[self.pos], b'<');
232 self.pos += 1; let mut hex_bytes = Vec::new();
235
236 while self.pos < self.data.len() {
237 let byte = self.data[self.pos];
238 self.pos += 1;
239
240 match byte {
241 b'>' => {
242 let mut result = Vec::with_capacity(hex_bytes.len() / 2);
244 let mut i = 0;
245 while i < hex_bytes.len() {
246 let high = hex_bytes[i];
247 let low = if i + 1 < hex_bytes.len() {
248 hex_bytes[i + 1]
249 } else {
250 0 };
252 result.push((high << 4) | low);
253 i += 2;
254 }
255 return Ok(Token::HexString(result));
256 }
257 b' ' | b'\t' | b'\n' | b'\r' | b'\x0c' | b'\x00' => continue,
258 b'0'..=b'9' => hex_bytes.push(byte - b'0'),
259 b'a'..=b'f' => hex_bytes.push(byte - b'a' + 10),
260 b'A'..=b'F' => hex_bytes.push(byte - b'A' + 10),
261 _ => {
262 return Err(FolioError::Parse {
263 offset: self.pos as u64 - 1,
264 message: format!("Invalid hex digit: 0x{:02x}", byte),
265 });
266 }
267 }
268 }
269
270 Err(FolioError::Parse {
271 offset: self.pos as u64,
272 message: "Unterminated hex string".into(),
273 })
274 }
275
276 fn read_name(&mut self) -> Result<Token> {
278 debug_assert_eq!(self.data[self.pos], b'/');
279 self.pos += 1; let mut name = Vec::new();
282
283 while self.pos < self.data.len() {
284 let byte = self.data[self.pos];
285
286 if is_whitespace(byte) || is_delimiter(byte) {
287 break;
288 }
289
290 self.pos += 1;
291
292 if byte == b'#' && self.pos + 1 < self.data.len() {
293 let h1 = hex_val(self.data[self.pos]);
295 let h2 = hex_val(self.data[self.pos + 1]);
296 if let (Some(high), Some(low)) = (h1, h2) {
297 name.push((high << 4) | low);
298 self.pos += 2;
299 } else {
300 name.push(b'#');
301 }
302 } else {
303 name.push(byte);
304 }
305 }
306
307 Ok(Token::Name(name))
308 }
309
310 fn read_number(&mut self) -> Result<Token> {
312 let start = self.pos;
313 let mut has_dot = false;
314
315 if self.pos < self.data.len()
317 && (self.data[self.pos] == b'+' || self.data[self.pos] == b'-')
318 {
319 self.pos += 1;
320 }
321
322 while self.pos < self.data.len() {
324 let byte = self.data[self.pos];
325 match byte {
326 b'0'..=b'9' => self.pos += 1,
327 b'.' if !has_dot => {
328 has_dot = true;
329 self.pos += 1;
330 }
331 _ => break,
332 }
333 }
334
335 let num_str =
336 std::str::from_utf8(&self.data[start..self.pos]).map_err(|_| FolioError::Parse {
337 offset: start as u64,
338 message: "Invalid number encoding".into(),
339 })?;
340
341 if has_dot {
342 let val: f64 = num_str.parse().map_err(|_| FolioError::Parse {
343 offset: start as u64,
344 message: format!("Invalid real number: '{}'", num_str),
345 })?;
346 Ok(Token::Real(val))
347 } else {
348 match num_str.parse::<i64>() {
350 Ok(val) => Ok(Token::Integer(val)),
351 Err(_) => {
352 let val: f64 = num_str.parse().map_err(|_| FolioError::Parse {
353 offset: start as u64,
354 message: format!("Invalid number: '{}'", num_str),
355 })?;
356 Ok(Token::Real(val))
357 }
358 }
359 }
360 }
361
362 fn read_keyword(&mut self) -> Result<Token> {
364 let start = self.pos;
365 while self.pos < self.data.len() {
366 let byte = self.data[self.pos];
367 if is_whitespace(byte) || is_delimiter(byte) {
368 break;
369 }
370 self.pos += 1;
371 }
372
373 if self.pos == start {
374 return Err(FolioError::Parse {
375 offset: start as u64,
376 message: format!(
377 "Unexpected byte: 0x{:02x}",
378 self.data.get(start).copied().unwrap_or(0)
379 ),
380 });
381 }
382
383 Ok(Token::Keyword(self.data[start..self.pos].to_vec()))
384 }
385}
386
/// True for the six whitespace bytes: NUL, tab, LF, form feed, CR, space.
pub fn is_whitespace(byte: u8) -> bool {
    const WHITESPACE: &[u8] = b"\x00\t\n\x0c\r ";
    WHITESPACE.contains(&byte)
}
391
/// True for the ten delimiter bytes that terminate names and keywords.
pub fn is_delimiter(byte: u8) -> bool {
    const DELIMITERS: &[u8] = b"()<>[]{}/%";
    DELIMITERS.contains(&byte)
}
399
/// Decodes one ASCII hex digit (either case) to its value 0-15,
/// or `None` for any non-hex byte.
fn hex_val(byte: u8) -> Option<u8> {
    (byte as char).to_digit(16).map(|v| v as u8)
}
409
#[cfg(test)]
mod tests {
    use super::*;

    // Drives the tokenizer to completion, collecting tokens and stopping
    // silently at the first error or EOF (adequate for these
    // happy-path-only tests).
    fn tokenize(input: &[u8]) -> Vec<Token> {
        let mut t = Tokenizer::new(input);
        let mut tokens = Vec::new();
        while let Ok(Some(tok)) = t.next_token() {
            tokens.push(tok);
        }
        tokens
    }

    #[test]
    fn test_integer() {
        assert_eq!(tokenize(b"42"), vec![Token::Integer(42)]);
        assert_eq!(tokenize(b"-17"), vec![Token::Integer(-17)]);
        assert_eq!(tokenize(b"+5"), vec![Token::Integer(5)]);
        assert_eq!(tokenize(b"0"), vec![Token::Integer(0)]);
    }

    #[test]
    fn test_real() {
        assert_eq!(tokenize(b"3.14"), vec![Token::Real(3.14)]);
        assert_eq!(tokenize(b"-0.5"), vec![Token::Real(-0.5)]);
        // Leading-dot form is valid: ".25" == 0.25.
        assert_eq!(tokenize(b".25"), vec![Token::Real(0.25)]);
    }

    #[test]
    fn test_name() {
        assert_eq!(tokenize(b"/Type"), vec![Token::Name(b"Type".to_vec())]);
        // "#42" is a hex escape for byte 0x42 == 'B'.
        assert_eq!(tokenize(b"/A#42"), vec![Token::Name(b"AB".to_vec())]);
    }

    #[test]
    fn test_literal_string() {
        assert_eq!(
            tokenize(b"(Hello)"),
            vec![Token::LiteralString(b"Hello".to_vec())]
        );
        // "\\n" in the input is a backslash-n escape -> newline byte.
        assert_eq!(
            tokenize(b"(Hello\\nWorld)"),
            vec![Token::LiteralString(b"Hello\nWorld".to_vec())]
        );
        // Balanced inner parentheses are kept verbatim.
        assert_eq!(
            tokenize(b"(Hello (World))"),
            vec![Token::LiteralString(b"Hello (World)".to_vec())]
        );
    }

    #[test]
    fn test_hex_string() {
        assert_eq!(
            tokenize(b"<48656C6C6F>"),
            vec![Token::HexString(b"Hello".to_vec())]
        );
        // Whitespace between hex digits is ignored.
        assert_eq!(
            tokenize(b"<48 65 6C>"),
            vec![Token::HexString(b"Hel".to_vec())]
        );
    }

    #[test]
    fn test_keywords() {
        assert_eq!(
            tokenize(b"true false null"),
            vec![
                Token::Keyword(b"true".to_vec()),
                Token::Keyword(b"false".to_vec()),
                Token::Keyword(b"null".to_vec()),
            ]
        );
    }

    #[test]
    fn test_delimiters() {
        assert_eq!(
            tokenize(b"[1 2]"),
            vec![
                Token::ArrayBegin,
                Token::Integer(1),
                Token::Integer(2),
                Token::ArrayEnd,
            ]
        );
        assert_eq!(
            tokenize(b"<< /Key /Value >>"),
            vec![
                Token::DictBegin,
                Token::Name(b"Key".to_vec()),
                Token::Name(b"Value".to_vec()),
                Token::DictEnd,
            ]
        );
    }

    #[test]
    fn test_comments() {
        // Everything from '%' to end of line is skipped.
        assert_eq!(
            tokenize(b"42 % this is a comment\n17"),
            vec![Token::Integer(42), Token::Integer(17)]
        );
    }

    #[test]
    fn test_mixed() {
        let tokens = tokenize(b"/Type /Page /MediaBox [0 0 612 792]");
        assert_eq!(tokens.len(), 9);
        assert_eq!(tokens[0], Token::Name(b"Type".to_vec()));
        assert_eq!(tokens[1], Token::Name(b"Page".to_vec()));
        assert_eq!(tokens[2], Token::Name(b"MediaBox".to_vec()));
        assert_eq!(tokens[3], Token::ArrayBegin);
        assert_eq!(tokens[8], Token::ArrayEnd);
    }

    #[test]
    fn test_octal_escape() {
        // \110 \145 \154 \154 \157 are the octal codes for "Hello".
        assert_eq!(
            tokenize(b"(\\110\\145\\154\\154\\157)"),
            vec![Token::LiteralString(b"Hello".to_vec())]
        );
    }
}