Skip to main content

jsonc_parser/
scanner.rs

1use std::borrow::Cow;
2
3use crate::string::CharProvider;
4
5use super::common::Range;
6use super::errors::*;
7use super::tokens::Token;
8
9/// Converts text into a stream of tokens.
10pub struct Scanner<'a> {
11  byte_index: usize,
12  token_start: usize,
13  bytes: &'a [u8],
14  current_token: Option<Token<'a>>,
15  file_text: &'a str,
16  allow_single_quoted_strings: bool,
17  allow_hexadecimal_numbers: bool,
18  allow_unary_plus_numbers: bool,
19}
20
/// Options controlling which syntax extensions the scanner accepts.
///
/// Besides `Debug`, the type is `Clone`, `PartialEq` and `Eq` so that a
/// configuration can be duplicated and compared by callers.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ScannerOptions {
  /// Allow single-quoted strings (defaults to `true`).
  pub allow_single_quoted_strings: bool,
  /// Allow hexadecimal numbers like 0xFF (defaults to `true`).
  pub allow_hexadecimal_numbers: bool,
  /// Allow unary plus sign on numbers like +42 (defaults to `true`).
  pub allow_unary_plus_numbers: bool,
}
31
32impl Default for ScannerOptions {
33  fn default() -> Self {
34    Self {
35      allow_single_quoted_strings: true,
36      allow_hexadecimal_numbers: true,
37      allow_unary_plus_numbers: true,
38    }
39  }
40}
41
42impl<'a> Scanner<'a> {
43  /// Creates a new scanner with specific options.
44  pub fn new(file_text: &'a str, options: &ScannerOptions) -> Scanner<'a> {
45    Scanner {
46      byte_index: 0,
47      token_start: 0,
48      bytes: file_text.as_bytes(),
49      current_token: None,
50      file_text,
51      allow_single_quoted_strings: options.allow_single_quoted_strings,
52      allow_hexadecimal_numbers: options.allow_hexadecimal_numbers,
53      allow_unary_plus_numbers: options.allow_unary_plus_numbers,
54    }
55  }
56
57  pub fn file_text(&self) -> &str {
58    self.file_text
59  }
60
61  /// Moves to and returns the next token.
62  pub fn scan(&mut self) -> Result<Option<Token<'a>>, ParseError> {
63    self.skip_whitespace();
64    self.token_start = self.byte_index;
65    if let Some(&b) = self.bytes.get(self.byte_index) {
66      let token_result = match b {
67        b'{' => {
68          self.byte_index += 1;
69          Ok(Token::OpenBrace)
70        }
71        b'}' => {
72          self.byte_index += 1;
73          Ok(Token::CloseBrace)
74        }
75        b'[' => {
76          self.byte_index += 1;
77          Ok(Token::OpenBracket)
78        }
79        b']' => {
80          self.byte_index += 1;
81          Ok(Token::CloseBracket)
82        }
83        b',' => {
84          self.byte_index += 1;
85          Ok(Token::Comma)
86        }
87        b':' => {
88          self.byte_index += 1;
89          Ok(Token::Colon)
90        }
91        b'\'' => {
92          if self.allow_single_quoted_strings {
93            self.parse_string()
94          } else {
95            Err(self.create_error_for_current_token(ParseErrorKind::SingleQuotedStringsNotAllowed))
96          }
97        }
98        b'"' => self.parse_string(),
99        b'/' => match self.bytes.get(self.byte_index + 1) {
100          Some(b'/') => Ok(self.parse_comment_line()),
101          Some(b'*') => self.parse_comment_block(),
102          _ => Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken)),
103        },
104        b'-' | b'+' | b'0'..=b'9' => self.parse_number(),
105        b't' if self.try_move_word("true") => Ok(Token::Boolean(true)),
106        b'f' if self.try_move_word("false") => Ok(Token::Boolean(false)),
107        b'n' if self.try_move_word("null") => Ok(Token::Null),
108        _ => self.parse_word(),
109      };
110      match token_result {
111        Ok(token) => {
112          self.current_token = Some(token.clone());
113          Ok(Some(token))
114        }
115        Err(err) => Err(err),
116      }
117    } else {
118      self.current_token = None;
119      Ok(None)
120    }
121  }
122
123  /// Gets the start position of the token.
124  pub fn token_start(&self) -> usize {
125    self.token_start
126  }
127
128  /// Gets the end position of the token.
129  pub fn token_end(&self) -> usize {
130    self.byte_index
131  }
132
133  /// Gets the current token.
134  pub fn token(&self) -> Option<Token<'a>> {
135    self.current_token.as_ref().map(|x| x.to_owned())
136  }
137
138  pub(super) fn create_error_for_current_token(&self, kind: ParseErrorKind) -> ParseError {
139    let end = if self.byte_index > self.token_start {
140      // token was fully scanned — use the exact token end
141      self.byte_index
142    } else if let Some(c) = self.file_text[self.byte_index..].chars().next() {
143      // scanner hasn't advanced past token_start — cover the current character
144      self.byte_index + c.len_utf8()
145    } else {
146      self.file_text.len()
147    };
148    let range = Range {
149      start: self.token_start,
150      end,
151    };
152    self.create_error_for_range(range, kind)
153  }
154
155  pub(super) fn create_error_for_current_char(&self, kind: ParseErrorKind) -> ParseError {
156    self.create_error_for_start(self.byte_index, kind)
157  }
158
159  pub(super) fn create_error_for_start(&self, start: usize, kind: ParseErrorKind) -> ParseError {
160    let range = Range {
161      start,
162      end: if let Some(c) = self.file_text[self.byte_index..].chars().next() {
163        self.byte_index + c.len_utf8()
164      } else {
165        self.file_text.len()
166      },
167    };
168    self.create_error_for_range(range, kind)
169  }
170
171  pub(super) fn create_error_for_range(&self, range: Range, kind: ParseErrorKind) -> ParseError {
172    ParseError::new(range, kind, self.file_text)
173  }
174
175  fn parse_string(&mut self) -> Result<Token<'a>, ParseError> {
176    let quote = self.bytes[self.byte_index];
177    let start = self.byte_index + 1;
178
179    // fast path: scan for closing quote or backslash byte-by-byte.
180    // this is safe because quote (0x22/0x27) and backslash (0x5C) are ASCII
181    // and can never appear as continuation bytes in multi-byte UTF-8 sequences.
182    let mut i = start;
183    while i < self.bytes.len() {
184      let b = self.bytes[i];
185      if b == quote {
186        // found closing quote with no escapes
187        let s = &self.file_text[start..i];
188        self.byte_index = i + 1;
189        return Ok(Token::String(Cow::Borrowed(s)));
190      }
191      if b == b'\\' {
192        break;
193      }
194      i += 1;
195    }
196
197    // slow path: handle escape sequences via CharProvider
198    crate::string::parse_string_with_char_provider(self)
199      .map(Token::String)
200      // todo(dsherret): don't convert the error kind to a string here
201      .map_err(|err| self.create_error_for_start(err.byte_index, ParseErrorKind::String(err.kind)))
202  }
203
204  fn parse_number(&mut self) -> Result<Token<'a>, ParseError> {
205    let start_byte_index = self.byte_index;
206
207    // handle unary plus and unary minus
208    match self.bytes.get(self.byte_index) {
209      Some(b'+') => {
210        if !self.allow_unary_plus_numbers {
211          return Err(self.create_error_for_current_token(ParseErrorKind::UnaryPlusNumbersNotAllowed));
212        }
213        self.byte_index += 1;
214      }
215      Some(b'-') => {
216        self.byte_index += 1;
217      }
218      _ => {}
219    }
220
221    match self.bytes.get(self.byte_index) {
222      Some(b'0') => {
223        self.byte_index += 1;
224
225        // check for hexadecimal literal (0x or 0X)
226        if matches!(self.bytes.get(self.byte_index), Some(b'x' | b'X')) {
227          if !self.allow_hexadecimal_numbers {
228            return Err(self.create_error_for_current_token(ParseErrorKind::HexadecimalNumbersNotAllowed));
229          }
230
231          self.byte_index += 1;
232
233          // must have at least one hex digit
234          if !matches!(self.bytes.get(self.byte_index), Some(b) if b.is_ascii_hexdigit()) {
235            return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
236          }
237
238          while matches!(self.bytes.get(self.byte_index), Some(b) if b.is_ascii_hexdigit()) {
239            self.byte_index += 1;
240          }
241
242          return Ok(Token::Number(&self.file_text[start_byte_index..self.byte_index]));
243        }
244      }
245      Some(b'1'..=b'9') => {
246        self.byte_index += 1;
247        while matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
248          self.byte_index += 1;
249        }
250      }
251      _ => {
252        return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigitFollowingNegativeSign));
253      }
254    }
255
256    if self.bytes.get(self.byte_index) == Some(&b'.') {
257      self.byte_index += 1;
258
259      if !matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
260        return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
261      }
262
263      while matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
264        self.byte_index += 1;
265      }
266    }
267
268    if matches!(self.bytes.get(self.byte_index), Some(b'e' | b'E')) {
269      self.byte_index += 1;
270
271      match self.bytes.get(self.byte_index) {
272        Some(b'-' | b'+') => {
273          self.byte_index += 1;
274          if !matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
275            return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
276          }
277        }
278        Some(b'0'..=b'9') => {}
279        _ => {
280          return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedPlusMinusOrDigitInNumberLiteral));
281        }
282      }
283
284      while matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
285        self.byte_index += 1;
286      }
287    }
288
289    Ok(Token::Number(&self.file_text[start_byte_index..self.byte_index]))
290  }
291
292  fn parse_comment_line(&mut self) -> Token<'a> {
293    debug_assert!(self.bytes[self.byte_index] == b'/');
294    self.byte_index += 1;
295    debug_assert!(self.bytes[self.byte_index] == b'/');
296    let start_byte_index = self.byte_index + 1;
297    self.byte_index += 1;
298
299    // scan byte-by-byte for newline; \n (0x0A) and \r (0x0D) are ASCII
300    // and can never appear as UTF-8 continuation bytes
301    while let Some(&b) = self.bytes.get(self.byte_index) {
302      if b == b'\n' {
303        break;
304      }
305      if b == b'\r' && self.bytes.get(self.byte_index + 1) == Some(&b'\n') {
306        break;
307      }
308      self.byte_index += 1;
309    }
310
311    Token::CommentLine(&self.file_text[start_byte_index..self.byte_index])
312  }
313
314  fn parse_comment_block(&mut self) -> Result<Token<'a>, ParseError> {
315    debug_assert!(self.bytes[self.byte_index] == b'/');
316    self.byte_index += 1;
317    debug_assert!(self.bytes[self.byte_index] == b'*');
318    let start_byte_index = self.byte_index + 1;
319    self.byte_index += 1;
320
321    // scan byte-by-byte for */; both are ASCII and safe to scan through UTF-8
322    loop {
323      match self.bytes.get(self.byte_index) {
324        Some(&b'*') if self.bytes.get(self.byte_index + 1) == Some(&b'/') => {
325          let end_byte_index = self.byte_index;
326          self.byte_index += 2;
327          return Ok(Token::CommentBlock(&self.file_text[start_byte_index..end_byte_index]));
328        }
329        Some(_) => self.byte_index += 1,
330        None => return Err(self.create_error_for_current_token(ParseErrorKind::UnterminatedCommentBlock)),
331      }
332    }
333  }
334
335  fn skip_whitespace(&mut self) {
336    while let Some(&b) = self.bytes.get(self.byte_index) {
337      if b <= b' ' {
338        match b {
339          b' ' | b'\t' | b'\n' | b'\r' | 0x0B | 0x0C => {
340            self.byte_index += 1;
341            continue;
342          }
343          _ => break,
344        }
345      } else if b >= 0x80 {
346        // handle non-ASCII unicode whitespace
347        let c = self.file_text[self.byte_index..].chars().next().unwrap();
348        if c.is_whitespace() {
349          self.byte_index += c.len_utf8();
350          continue;
351        }
352        break;
353      } else {
354        break;
355      }
356    }
357  }
358
359  fn try_move_word(&mut self, text: &str) -> bool {
360    let text_bytes = text.as_bytes();
361    let end = self.byte_index + text_bytes.len();
362    if end > self.bytes.len() {
363      return false;
364    }
365    if &self.bytes[self.byte_index..end] != text_bytes {
366      return false;
367    }
368    // ensure the word is not followed by an alphanumeric character
369    if let Some(&next_byte) = self.bytes.get(end) {
370      if next_byte.is_ascii_alphanumeric() {
371        return false;
372      }
373      // check non-ASCII alphanumeric
374      if next_byte >= 0x80
375        && let Some(c) = self.file_text[end..].chars().next()
376        && c.is_alphanumeric()
377      {
378        return false;
379      }
380    }
381    self.byte_index = end;
382    true
383  }
384
385  fn parse_word(&mut self) -> Result<Token<'a>, ParseError> {
386    let start_byte_index = self.byte_index;
387
388    while self.byte_index < self.bytes.len() {
389      let b = self.bytes[self.byte_index];
390      if b < 0x80 {
391        // ASCII fast path
392        if b.is_ascii_whitespace() || b == b':' {
393          break;
394        }
395        if b.is_ascii_alphanumeric() || b == b'-' || b == b'_' {
396          self.byte_index += 1;
397        } else {
398          return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
399        }
400      } else {
401        // non-ASCII: decode char
402        let c = self.file_text[self.byte_index..].chars().next().unwrap();
403        if c.is_whitespace() {
404          break;
405        }
406        if c.is_alphanumeric() {
407          self.byte_index += c.len_utf8();
408        } else {
409          return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
410        }
411      }
412    }
413
414    if self.byte_index == start_byte_index {
415      return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
416    }
417
418    Ok(Token::Word(&self.file_text[start_byte_index..self.byte_index]))
419  }
420
421  fn current_char(&self) -> Option<char> {
422    let &b = self.bytes.get(self.byte_index)?;
423    if b < 0x80 {
424      Some(b as char)
425    } else {
426      self.file_text[self.byte_index..].chars().next()
427    }
428  }
429
430  fn move_next_char(&mut self) -> Option<char> {
431    if self.byte_index >= self.bytes.len() {
432      return None;
433    }
434    let b = self.bytes[self.byte_index];
435    if b < 0x80 {
436      self.byte_index += 1;
437    } else {
438      let c = self.file_text[self.byte_index..].chars().next().unwrap();
439      self.byte_index += c.len_utf8();
440    }
441    self.current_char()
442  }
443}
444
445impl<'a> CharProvider<'a> for Scanner<'a> {
446  fn current_char(&mut self) -> Option<char> {
447    Scanner::current_char(self)
448  }
449
450  fn move_next_char(&mut self) -> Option<char> {
451    Scanner::move_next_char(self)
452  }
453
454  fn byte_index(&self) -> usize {
455    self.byte_index
456  }
457
458  fn text(&self) -> &'a str {
459    self.file_text
460  }
461}
462
#[cfg(test)]
mod tests {
  use std::borrow::Cow;

  use super::super::tokens::Token;
  use super::*;
  use pretty_assertions::assert_eq;

  // Escapes inside double-quoted strings are decoded.
  #[test]
  fn it_tokenizes_string() {
    let input = r#""t\"est", "\t\r\n\n\u0020 test\n other","#;
    let expected = vec![
      Token::String(Cow::Borrowed(r#"t"est"#)),
      Token::Comma,
      Token::String(Cow::Borrowed("\t\r\n\n  test\n other")),
      Token::Comma,
    ];
    assert_has_tokens(input, expected);
  }

  // `\'` is not a valid escape inside a double-quoted string.
  #[test]
  fn it_errors_escaping_single_quote_in_double_quote() {
    let input = r#""t\'est""#;
    assert_has_error(input, "Invalid escape in double quote string on line 1 column 3");
  }

  // Single-quoted strings are accepted with the default options.
  #[test]
  fn it_tokenizes_single_quote_string() {
    let input = r#"'t\'est','a',"#;
    let expected = vec![
      Token::String(Cow::Borrowed(r#"t'est"#)),
      Token::Comma,
      Token::String(Cow::Borrowed("a")),
      Token::Comma,
    ];
    assert_has_tokens(input, expected);
  }

  // `\"` is not a valid escape inside a single-quoted string.
  #[test]
  fn it_errors_escaping_double_quote_in_single_quote() {
    let input = r#"'t\"est'"#;
    assert_has_error(input, "Invalid escape in single quote string on line 1 column 3");
  }

  // A word may not begin with a character such as `&`.
  #[test]
  fn it_errors_for_word_starting_with_invalid_token() {
    let input = r#"{ &test }"#;
    assert_has_error(input, "Unexpected token on line 1 column 3");
  }

  // Integers, fractions, negatives and exponents.
  #[test]
  fn it_tokenizes_numbers() {
    let input = "0, 0.123, -198, 0e-345, 0.3e+025, 1e1,";
    let expected = vec![
      Token::Number("0"),
      Token::Comma,
      Token::Number("0.123"),
      Token::Comma,
      Token::Number("-198"),
      Token::Comma,
      Token::Number("0e-345"),
      Token::Comma,
      Token::Number("0.3e+025"),
      Token::Comma,
      Token::Number("1e1"),
      Token::Comma,
    ];
    assert_has_tokens(input, expected);
  }

  // Hexadecimal literals with either prefix case and either digit case.
  #[test]
  fn it_tokenizes_hexadecimal_numbers() {
    let input = "0x7DF, 0xFF, 0x123ABC, 0xabc, 0X1F";
    let expected = vec![
      Token::Number("0x7DF"),
      Token::Comma,
      Token::Number("0xFF"),
      Token::Comma,
      Token::Number("0x123ABC"),
      Token::Comma,
      Token::Number("0xabc"),
      Token::Comma,
      Token::Number("0X1F"),
    ];
    assert_has_tokens(input, expected);
  }

  // A leading `+` is kept as part of the number text.
  #[test]
  fn it_tokenizes_unary_plus_numbers() {
    let input = "+42, +0.5, +1e10, +0xFF";
    let expected = vec![
      Token::Number("+42"),
      Token::Comma,
      Token::Number("+0.5"),
      Token::Comma,
      Token::Number("+1e10"),
      Token::Comma,
      Token::Number("+0xFF"),
    ];
    assert_has_tokens(input, expected);
  }

  // An exponent marker must be followed by a sign or digit, and a sign by a digit.
  #[test]
  fn it_errors_invalid_exponent() {
    let missing_digits = r#"1ea"#;
    assert_has_error(
      missing_digits,
      "Expected plus, minus, or digit in number literal on line 1 column 3",
    );

    let sign_without_digit = r#"1e-a"#;
    assert_has_error(sign_without_digit, "Expected digit on line 1 column 4");
  }

  // Punctuation and the three keywords.
  #[test]
  fn it_tokenizes_simple_tokens() {
    let input = "{}[],:true,false,null,";
    let expected = vec![
      Token::OpenBrace,
      Token::CloseBrace,
      Token::OpenBracket,
      Token::CloseBracket,
      Token::Comma,
      Token::Colon,
      Token::Boolean(true),
      Token::Comma,
      Token::Boolean(false),
      Token::Comma,
      Token::Null,
      Token::Comma,
    ];
    assert_has_tokens(input, expected);
  }

  // Line comments end at `\n` or `\r\n`; the terminator is not part of the text.
  #[test]
  fn it_tokenizes_comment_line() {
    let input = "//test\n//t\r\n// test\n,";
    let expected = vec![
      Token::CommentLine("test"),
      Token::CommentLine("t"),
      Token::CommentLine(" test"),
      Token::Comma,
    ];
    assert_has_tokens(input, expected);
  }

  // Block comments may span lines and may directly follow one another.
  #[test]
  fn it_tokenizes_comment_blocks() {
    let input = "/*test\n *//* test*/,";
    let expected = vec![
      Token::CommentBlock("test\n "),
      Token::CommentBlock(" test"),
      Token::Comma,
    ];
    assert_has_tokens(input, expected);
  }

  // An unpaired low surrogate escape is reported as an error.
  #[test]
  fn it_errors_on_invalid_utf8_char_for_issue_6() {
    let input = "\"\\uDF06\"";
    assert_has_error(
      input,
      "Invalid unicode escape sequence. 'DF06 (unpaired low surrogate)' is not a valid UTF8 character on line 1 column 2",
    );
  }

  /// Scans `text` to the end with default options and asserts that exactly
  /// `tokens` were produced; panics on any scan error.
  fn assert_has_tokens(text: &str, tokens: Vec<Token>) {
    let mut scanner = Scanner::new(text, &Default::default());
    let mut scanned_tokens = Vec::new();

    while let Some(token) = scanner
      .scan()
      .unwrap_or_else(|err| panic!("Error parsing: {:?}", err))
    {
      scanned_tokens.push(token);
    }

    assert_eq!(scanned_tokens, tokens);
  }

  /// Scans `text` with default options until the first error and asserts
  /// that its display text equals `message`. If scanning finishes without
  /// an error, the compared text is the empty string.
  fn assert_has_error(text: &str, message: &str) {
    let mut scanner = Scanner::new(text, &Default::default());

    let error_message = loop {
      match scanner.scan() {
        Ok(Some(_)) => continue,
        Ok(None) => break String::new(),
        Err(err) => break err.to_string(),
      }
    };

    assert_eq!(error_message, message);
  }
}