// jsonc_parser/scanner.rs

1use std::borrow::Cow;
2
3use crate::string::CharProvider;
4
5use super::common::Range;
6use super::errors::*;
7use super::tokens::Token;
8
9/// Converts text into a stream of tokens.
10pub struct Scanner<'a> {
11  byte_index: usize,
12  token_start: usize,
13  bytes: &'a [u8],
14  current_token: Option<Token<'a>>,
15  file_text: &'a str,
16  allow_single_quoted_strings: bool,
17  allow_hexadecimal_numbers: bool,
18  allow_unary_plus_numbers: bool,
19}
20
/// Options for the scanner.
///
/// Every extension is enabled by [`Default`]; set a field to `false` to make
/// the scanner reject that syntax.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ScannerOptions {
  /// Allow single-quoted strings (defaults to `true`).
  pub allow_single_quoted_strings: bool,
  /// Allow hexadecimal numbers like 0xFF (defaults to `true`).
  pub allow_hexadecimal_numbers: bool,
  /// Allow unary plus sign on numbers like +42 (defaults to `true`).
  pub allow_unary_plus_numbers: bool,
}

impl Default for ScannerOptions {
  /// Returns options with every syntax extension turned on.
  fn default() -> Self {
    Self {
      allow_single_quoted_strings: true,
      allow_hexadecimal_numbers: true,
      allow_unary_plus_numbers: true,
    }
  }
}
41
42impl<'a> Scanner<'a> {
43  /// Creates a new scanner with specific options.
44  pub fn new(file_text: &'a str, options: &ScannerOptions) -> Scanner<'a> {
45    Scanner {
46      byte_index: 0,
47      token_start: 0,
48      bytes: file_text.as_bytes(),
49      current_token: None,
50      file_text,
51      allow_single_quoted_strings: options.allow_single_quoted_strings,
52      allow_hexadecimal_numbers: options.allow_hexadecimal_numbers,
53      allow_unary_plus_numbers: options.allow_unary_plus_numbers,
54    }
55  }
56
57  pub fn file_text(&self) -> &str {
58    self.file_text
59  }
60
61  /// Moves to and returns the next token.
62  pub fn scan(&mut self) -> Result<Option<Token<'a>>, ParseError> {
63    self.skip_whitespace();
64    self.token_start = self.byte_index;
65    if let Some(&b) = self.bytes.get(self.byte_index) {
66      let token_result = match b {
67        b'{' => {
68          self.byte_index += 1;
69          Ok(Token::OpenBrace)
70        }
71        b'}' => {
72          self.byte_index += 1;
73          Ok(Token::CloseBrace)
74        }
75        b'[' => {
76          self.byte_index += 1;
77          Ok(Token::OpenBracket)
78        }
79        b']' => {
80          self.byte_index += 1;
81          Ok(Token::CloseBracket)
82        }
83        b',' => {
84          self.byte_index += 1;
85          Ok(Token::Comma)
86        }
87        b':' => {
88          self.byte_index += 1;
89          Ok(Token::Colon)
90        }
91        b'\'' => {
92          if self.allow_single_quoted_strings {
93            self.parse_string()
94          } else {
95            Err(self.create_error_for_current_token(ParseErrorKind::SingleQuotedStringsNotAllowed))
96          }
97        }
98        b'"' => self.parse_string(),
99        b'/' => match self.bytes.get(self.byte_index + 1) {
100          Some(b'/') => Ok(self.parse_comment_line()),
101          Some(b'*') => self.parse_comment_block(),
102          _ => Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken)),
103        },
104        b'-' | b'+' | b'0'..=b'9' => self.parse_number(),
105        b't' if self.try_move_word("true") => Ok(Token::Boolean(true)),
106        b'f' if self.try_move_word("false") => Ok(Token::Boolean(false)),
107        b'n' if self.try_move_word("null") => Ok(Token::Null),
108        _ => self.parse_word(),
109      };
110      match token_result {
111        Ok(token) => {
112          self.current_token = Some(token.clone());
113          Ok(Some(token))
114        }
115        Err(err) => Err(err),
116      }
117    } else {
118      self.current_token = None;
119      Ok(None)
120    }
121  }
122
123  /// Gets the start position of the token.
124  pub fn token_start(&self) -> usize {
125    self.token_start
126  }
127
128  /// Gets the end position of the token.
129  pub fn token_end(&self) -> usize {
130    self.byte_index
131  }
132
133  /// Gets the current token.
134  pub fn token(&self) -> Option<Token<'a>> {
135    self.current_token.as_ref().map(|x| x.to_owned())
136  }
137
138  pub(super) fn create_error_for_current_token(&self, kind: ParseErrorKind) -> ParseError {
139    self.create_error_for_start(self.token_start, kind)
140  }
141
142  pub(super) fn create_error_for_current_char(&self, kind: ParseErrorKind) -> ParseError {
143    self.create_error_for_start(self.byte_index, kind)
144  }
145
146  pub(super) fn create_error_for_start(&self, start: usize, kind: ParseErrorKind) -> ParseError {
147    let range = Range {
148      start,
149      end: if let Some(c) = self.file_text[self.byte_index..].chars().next() {
150        self.byte_index + c.len_utf8()
151      } else {
152        self.file_text.len()
153      },
154    };
155    self.create_error_for_range(range, kind)
156  }
157
158  pub(super) fn create_error_for_range(&self, range: Range, kind: ParseErrorKind) -> ParseError {
159    ParseError::new(range, kind, self.file_text)
160  }
161
162  fn parse_string(&mut self) -> Result<Token<'a>, ParseError> {
163    let quote = self.bytes[self.byte_index];
164    let start = self.byte_index + 1;
165
166    // fast path: scan for closing quote or backslash byte-by-byte.
167    // this is safe because quote (0x22/0x27) and backslash (0x5C) are ASCII
168    // and can never appear as continuation bytes in multi-byte UTF-8 sequences.
169    let mut i = start;
170    while i < self.bytes.len() {
171      let b = self.bytes[i];
172      if b == quote {
173        // found closing quote with no escapes
174        let s = &self.file_text[start..i];
175        self.byte_index = i + 1;
176        return Ok(Token::String(Cow::Borrowed(s)));
177      }
178      if b == b'\\' {
179        break;
180      }
181      i += 1;
182    }
183
184    // slow path: handle escape sequences via CharProvider
185    crate::string::parse_string_with_char_provider(self)
186      .map(Token::String)
187      // todo(dsherret): don't convert the error kind to a string here
188      .map_err(|err| self.create_error_for_start(err.byte_index, ParseErrorKind::String(err.kind)))
189  }
190
191  fn parse_number(&mut self) -> Result<Token<'a>, ParseError> {
192    let start_byte_index = self.byte_index;
193
194    // handle unary plus and unary minus
195    match self.bytes.get(self.byte_index) {
196      Some(b'+') => {
197        if !self.allow_unary_plus_numbers {
198          return Err(self.create_error_for_current_token(ParseErrorKind::UnaryPlusNumbersNotAllowed));
199        }
200        self.byte_index += 1;
201      }
202      Some(b'-') => {
203        self.byte_index += 1;
204      }
205      _ => {}
206    }
207
208    match self.bytes.get(self.byte_index) {
209      Some(b'0') => {
210        self.byte_index += 1;
211
212        // check for hexadecimal literal (0x or 0X)
213        if matches!(self.bytes.get(self.byte_index), Some(b'x' | b'X')) {
214          if !self.allow_hexadecimal_numbers {
215            return Err(self.create_error_for_current_token(ParseErrorKind::HexadecimalNumbersNotAllowed));
216          }
217
218          self.byte_index += 1;
219
220          // must have at least one hex digit
221          if !matches!(self.bytes.get(self.byte_index), Some(b) if b.is_ascii_hexdigit()) {
222            return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
223          }
224
225          while matches!(self.bytes.get(self.byte_index), Some(b) if b.is_ascii_hexdigit()) {
226            self.byte_index += 1;
227          }
228
229          return Ok(Token::Number(&self.file_text[start_byte_index..self.byte_index]));
230        }
231      }
232      Some(b'1'..=b'9') => {
233        self.byte_index += 1;
234        while matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
235          self.byte_index += 1;
236        }
237      }
238      _ => {
239        return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigitFollowingNegativeSign));
240      }
241    }
242
243    if self.bytes.get(self.byte_index) == Some(&b'.') {
244      self.byte_index += 1;
245
246      if !matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
247        return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
248      }
249
250      while matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
251        self.byte_index += 1;
252      }
253    }
254
255    if matches!(self.bytes.get(self.byte_index), Some(b'e' | b'E')) {
256      self.byte_index += 1;
257
258      match self.bytes.get(self.byte_index) {
259        Some(b'-' | b'+') => {
260          self.byte_index += 1;
261          if !matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
262            return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
263          }
264        }
265        Some(b'0'..=b'9') => {}
266        _ => {
267          return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedPlusMinusOrDigitInNumberLiteral));
268        }
269      }
270
271      while matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
272        self.byte_index += 1;
273      }
274    }
275
276    Ok(Token::Number(&self.file_text[start_byte_index..self.byte_index]))
277  }
278
279  fn parse_comment_line(&mut self) -> Token<'a> {
280    debug_assert!(self.bytes[self.byte_index] == b'/');
281    self.byte_index += 1;
282    debug_assert!(self.bytes[self.byte_index] == b'/');
283    let start_byte_index = self.byte_index + 1;
284    self.byte_index += 1;
285
286    // scan byte-by-byte for newline; \n (0x0A) and \r (0x0D) are ASCII
287    // and can never appear as UTF-8 continuation bytes
288    while let Some(&b) = self.bytes.get(self.byte_index) {
289      if b == b'\n' {
290        break;
291      }
292      if b == b'\r' && self.bytes.get(self.byte_index + 1) == Some(&b'\n') {
293        break;
294      }
295      self.byte_index += 1;
296    }
297
298    Token::CommentLine(&self.file_text[start_byte_index..self.byte_index])
299  }
300
301  fn parse_comment_block(&mut self) -> Result<Token<'a>, ParseError> {
302    debug_assert!(self.bytes[self.byte_index] == b'/');
303    self.byte_index += 1;
304    debug_assert!(self.bytes[self.byte_index] == b'*');
305    let start_byte_index = self.byte_index + 1;
306    self.byte_index += 1;
307
308    // scan byte-by-byte for */; both are ASCII and safe to scan through UTF-8
309    loop {
310      match self.bytes.get(self.byte_index) {
311        Some(&b'*') if self.bytes.get(self.byte_index + 1) == Some(&b'/') => {
312          let end_byte_index = self.byte_index;
313          self.byte_index += 2;
314          return Ok(Token::CommentBlock(&self.file_text[start_byte_index..end_byte_index]));
315        }
316        Some(_) => self.byte_index += 1,
317        None => return Err(self.create_error_for_current_token(ParseErrorKind::UnterminatedCommentBlock)),
318      }
319    }
320  }
321
322  fn skip_whitespace(&mut self) {
323    while let Some(&b) = self.bytes.get(self.byte_index) {
324      if b <= b' ' {
325        match b {
326          b' ' | b'\t' | b'\n' | b'\r' | 0x0B | 0x0C => {
327            self.byte_index += 1;
328            continue;
329          }
330          _ => break,
331        }
332      } else if b >= 0x80 {
333        // handle non-ASCII unicode whitespace
334        let c = self.file_text[self.byte_index..].chars().next().unwrap();
335        if c.is_whitespace() {
336          self.byte_index += c.len_utf8();
337          continue;
338        }
339        break;
340      } else {
341        break;
342      }
343    }
344  }
345
346  fn try_move_word(&mut self, text: &str) -> bool {
347    let text_bytes = text.as_bytes();
348    let end = self.byte_index + text_bytes.len();
349    if end > self.bytes.len() {
350      return false;
351    }
352    if &self.bytes[self.byte_index..end] != text_bytes {
353      return false;
354    }
355    // ensure the word is not followed by an alphanumeric character
356    if let Some(&next_byte) = self.bytes.get(end) {
357      if next_byte.is_ascii_alphanumeric() {
358        return false;
359      }
360      // check non-ASCII alphanumeric
361      if next_byte >= 0x80
362        && let Some(c) = self.file_text[end..].chars().next()
363        && c.is_alphanumeric()
364      {
365        return false;
366      }
367    }
368    self.byte_index = end;
369    true
370  }
371
372  fn parse_word(&mut self) -> Result<Token<'a>, ParseError> {
373    let start_byte_index = self.byte_index;
374
375    while self.byte_index < self.bytes.len() {
376      let b = self.bytes[self.byte_index];
377      if b < 0x80 {
378        // ASCII fast path
379        if b.is_ascii_whitespace() || b == b':' {
380          break;
381        }
382        if b.is_ascii_alphanumeric() || b == b'-' || b == b'_' {
383          self.byte_index += 1;
384        } else {
385          return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
386        }
387      } else {
388        // non-ASCII: decode char
389        let c = self.file_text[self.byte_index..].chars().next().unwrap();
390        if c.is_whitespace() {
391          break;
392        }
393        if c.is_alphanumeric() {
394          self.byte_index += c.len_utf8();
395        } else {
396          return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
397        }
398      }
399    }
400
401    if self.byte_index == start_byte_index {
402      return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
403    }
404
405    Ok(Token::Word(&self.file_text[start_byte_index..self.byte_index]))
406  }
407
408  fn current_char(&self) -> Option<char> {
409    let &b = self.bytes.get(self.byte_index)?;
410    if b < 0x80 {
411      Some(b as char)
412    } else {
413      self.file_text[self.byte_index..].chars().next()
414    }
415  }
416
417  fn move_next_char(&mut self) -> Option<char> {
418    if self.byte_index >= self.bytes.len() {
419      return None;
420    }
421    let b = self.bytes[self.byte_index];
422    if b < 0x80 {
423      self.byte_index += 1;
424    } else {
425      let c = self.file_text[self.byte_index..].chars().next().unwrap();
426      self.byte_index += c.len_utf8();
427    }
428    self.current_char()
429  }
430}
431
432impl<'a> CharProvider<'a> for Scanner<'a> {
433  fn current_char(&mut self) -> Option<char> {
434    Scanner::current_char(self)
435  }
436
437  fn move_next_char(&mut self) -> Option<char> {
438    Scanner::move_next_char(self)
439  }
440
441  fn byte_index(&self) -> usize {
442    self.byte_index
443  }
444
445  fn text(&self) -> &'a str {
446    self.file_text
447  }
448}
449
#[cfg(test)]
mod tests {
  use std::borrow::Cow;

  use super::super::tokens::Token;
  use super::*;
  use pretty_assertions::assert_eq;

  /// Escapes inside double-quoted strings are decoded.
  #[test]
  fn it_tokenizes_string() {
    assert_has_tokens(
      r#""t\"est", "\t\r\n\n\u0020 test\n other","#,
      vec![
        Token::String(Cow::Borrowed(r#"t"est"#)),
        Token::Comma,
        Token::String(Cow::Borrowed("\t\r\n\n  test\n other")),
        Token::Comma,
      ],
    );
  }

  /// `\'` is not a valid escape inside a double-quoted string.
  #[test]
  fn it_errors_escaping_single_quote_in_double_quote() {
    assert_has_error(
      r#""t\'est""#,
      "Invalid escape in double quote string on line 1 column 3",
    );
  }

  /// Single-quoted strings are accepted with the default options.
  #[test]
  fn it_tokenizes_single_quote_string() {
    assert_has_tokens(
      r#"'t\'est','a',"#,
      vec![
        Token::String(Cow::Borrowed(r#"t'est"#)),
        Token::Comma,
        Token::String(Cow::Borrowed("a")),
        Token::Comma,
      ],
    );
  }

  /// `\"` is not a valid escape inside a single-quoted string.
  #[test]
  fn it_errors_escaping_double_quote_in_single_quote() {
    assert_has_error(
      r#"'t\"est'"#,
      "Invalid escape in single quote string on line 1 column 3",
    );
  }

  /// A word may not begin with a character such as `&`.
  #[test]
  fn it_errors_for_word_starting_with_invalid_token() {
    assert_has_error(r#"{ &test }"#, "Unexpected token on line 1 column 3");
  }

  /// Integers, fractions and exponents keep their exact source text.
  #[test]
  fn it_tokenizes_numbers() {
    assert_has_tokens(
      "0, 0.123, -198, 0e-345, 0.3e+025, 1e1,",
      vec![
        Token::Number("0"),
        Token::Comma,
        Token::Number("0.123"),
        Token::Comma,
        Token::Number("-198"),
        Token::Comma,
        Token::Number("0e-345"),
        Token::Comma,
        Token::Number("0.3e+025"),
        Token::Comma,
        Token::Number("1e1"),
        Token::Comma,
      ],
    );
  }

  /// Both `0x` and `0X` prefixes and either digit case are accepted.
  #[test]
  fn it_tokenizes_hexadecimal_numbers() {
    assert_has_tokens(
      "0x7DF, 0xFF, 0x123ABC, 0xabc, 0X1F",
      vec![
        Token::Number("0x7DF"),
        Token::Comma,
        Token::Number("0xFF"),
        Token::Comma,
        Token::Number("0x123ABC"),
        Token::Comma,
        Token::Number("0xabc"),
        Token::Comma,
        Token::Number("0X1F"),
      ],
    );
  }

  /// A leading `+` is kept as part of the number text.
  #[test]
  fn it_tokenizes_unary_plus_numbers() {
    assert_has_tokens(
      "+42, +0.5, +1e10, +0xFF",
      vec![
        Token::Number("+42"),
        Token::Comma,
        Token::Number("+0.5"),
        Token::Comma,
        Token::Number("+1e10"),
        Token::Comma,
        Token::Number("+0xFF"),
      ],
    );
  }

  /// An exponent needs a digit, optionally preceded by a sign.
  #[test]
  fn it_errors_invalid_exponent() {
    assert_has_error(
      r#"1ea"#,
      "Expected plus, minus, or digit in number literal on line 1 column 3",
    );
    assert_has_error(r#"1e-a"#, "Expected digit on line 1 column 4");
  }

  /// Punctuation and the three keywords map to their dedicated tokens.
  #[test]
  fn it_tokenizes_simple_tokens() {
    assert_has_tokens(
      "{}[],:true,false,null,",
      vec![
        Token::OpenBrace,
        Token::CloseBrace,
        Token::OpenBracket,
        Token::CloseBracket,
        Token::Comma,
        Token::Colon,
        Token::Boolean(true),
        Token::Comma,
        Token::Boolean(false),
        Token::Comma,
        Token::Null,
        Token::Comma,
      ],
    );
  }

  /// Line comments end before `\n` and before `\r\n`.
  #[test]
  fn it_tokenizes_comment_line() {
    assert_has_tokens(
      "//test\n//t\r\n// test\n,",
      vec![
        Token::CommentLine("test"),
        Token::CommentLine("t"),
        Token::CommentLine(" test"),
        Token::Comma,
      ],
    );
  }

  /// Block comments may span lines and may directly follow one another.
  #[test]
  fn it_tokenizes_comment_blocks() {
    assert_has_tokens(
      "/*test\n *//* test*/,",
      vec![
        Token::CommentBlock("test\n "),
        Token::CommentBlock(" test"),
        Token::Comma,
      ],
    );
  }

  /// An unpaired low surrogate escape is reported instead of panicking.
  #[test]
  fn it_errors_on_invalid_utf8_char_for_issue_6() {
    assert_has_error(
      "\"\\uDF06\"",
      "Invalid unicode escape sequence. 'DF06 (unpaired low surrogate)' is not a valid UTF8 character on line 1 column 2",
    );
  }

  /// Scans `text` to the end with default options and checks the produced
  /// tokens against `tokens`; panics on the first scan error.
  fn assert_has_tokens(text: &str, tokens: Vec<Token>) {
    let mut scanner = Scanner::new(text, &Default::default());
    let mut scanned_tokens = Vec::new();

    while let Some(token) = scanner
      .scan()
      .unwrap_or_else(|err| panic!("Error parsing: {:?}", err))
    {
      scanned_tokens.push(token);
    }

    assert_eq!(scanned_tokens, tokens);
  }

  /// Scans `text` with default options until the first error and checks its
  /// message against `message`. If scanning finishes without an error, the
  /// compared message is the empty string.
  fn assert_has_error(text: &str, message: &str) {
    let mut scanner = Scanner::new(text, &Default::default());

    let error_message = loop {
      match scanner.scan() {
        Ok(Some(_)) => continue,
        Ok(None) => break String::new(),
        Err(err) => break err.to_string(),
      }
    };

    assert_eq!(error_message, message);
  }
}