jsonc_parser/
scanner.rs

1use crate::string::CharProvider;
2
3use super::common::Range;
4use super::errors::*;
5use super::tokens::Token;
6use std::str::Chars;
7
/// Converts text into a stream of tokens.
pub struct Scanner<'a> {
  /// Byte offset into `file_text` of the current character.
  byte_index: usize,
  /// Byte offset into `file_text` at which the most recent `scan` call
  /// started looking for a token (after skipping whitespace).
  token_start: usize,
  /// Source of the characters that have not been buffered yet.
  char_iter: Chars<'a>,
  // todo(dsherret): why isn't this a VecDeque?
  /// Lookahead buffer; its first element is the current character.
  char_buffer: Vec<char>,
  /// The token produced by the most recent successful `scan` call, if any.
  current_token: Option<Token<'a>>,
  /// The complete text being scanned.
  file_text: &'a str,
  /// Whether `'...'` strings are accepted.
  allow_single_quoted_strings: bool,
  /// Whether `0x`/`0X` prefixed numbers are accepted.
  allow_hexadecimal_numbers: bool,
  /// Whether a leading `+` on a number is accepted.
  allow_unary_plus_numbers: bool,
}
21
/// Upper bound on the number of characters held in the lookahead buffer.
///
/// The deepest lookahead is the keyword check for `false`: every character of
/// the word is peeked plus the single character that follows it.
const CHAR_BUFFER_MAX_SIZE: usize = "false".len() + 1;
23
/// Options for the scanner.
///
/// Every field toggles one syntax extension; see the `Default`
/// implementation for the values used when nothing is specified.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ScannerOptions {
  /// Allow single-quoted strings (defaults to `true`).
  pub allow_single_quoted_strings: bool,
  /// Allow hexadecimal numbers like 0xFF (defaults to `true`).
  pub allow_hexadecimal_numbers: bool,
  /// Allow unary plus sign on numbers like +42 (defaults to `true`).
  pub allow_unary_plus_numbers: bool,
}
34
35impl Default for ScannerOptions {
36  fn default() -> Self {
37    Self {
38      allow_single_quoted_strings: true,
39      allow_hexadecimal_numbers: true,
40      allow_unary_plus_numbers: true,
41    }
42  }
43}
44
45impl<'a> Scanner<'a> {
46  /// Creates a new scanner with specific options.
47  pub fn new(file_text: &'a str, options: &ScannerOptions) -> Scanner<'a> {
48    let mut char_iter = file_text.chars();
49    let mut char_buffer = Vec::with_capacity(CHAR_BUFFER_MAX_SIZE);
50    let current_char = char_iter.next();
51    if let Some(current_char) = current_char {
52      char_buffer.push(current_char);
53    }
54
55    Scanner {
56      byte_index: 0,
57      token_start: 0,
58      char_iter,
59      char_buffer,
60      current_token: None,
61      file_text,
62      allow_single_quoted_strings: options.allow_single_quoted_strings,
63      allow_hexadecimal_numbers: options.allow_hexadecimal_numbers,
64      allow_unary_plus_numbers: options.allow_unary_plus_numbers,
65    }
66  }
67
68  pub fn file_text(&self) -> &str {
69    self.file_text
70  }
71
72  /// Moves to and returns the next token.
73  pub fn scan(&mut self) -> Result<Option<Token<'a>>, ParseError> {
74    self.skip_whitespace();
75    self.token_start = self.byte_index;
76    if let Some(current_char) = self.current_char() {
77      let token_result = match current_char {
78        '{' => {
79          self.move_next_char();
80          Ok(Token::OpenBrace)
81        }
82        '}' => {
83          self.move_next_char();
84          Ok(Token::CloseBrace)
85        }
86        '[' => {
87          self.move_next_char();
88          Ok(Token::OpenBracket)
89        }
90        ']' => {
91          self.move_next_char();
92          Ok(Token::CloseBracket)
93        }
94        ',' => {
95          self.move_next_char();
96          Ok(Token::Comma)
97        }
98        ':' => {
99          self.move_next_char();
100          Ok(Token::Colon)
101        }
102        '\'' => {
103          if self.allow_single_quoted_strings {
104            self.parse_string()
105          } else {
106            Err(self.create_error_for_current_token(ParseErrorKind::SingleQuotedStringsNotAllowed))
107          }
108        }
109        '"' => self.parse_string(),
110        '/' => match self.peek_char() {
111          Some('/') => Ok(self.parse_comment_line()),
112          Some('*') => self.parse_comment_block(),
113          _ => Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken)),
114        },
115        _ => {
116          if current_char == '-' || current_char == '+' || self.is_digit() {
117            self.parse_number()
118          } else if self.try_move_word("true") {
119            Ok(Token::Boolean(true))
120          } else if self.try_move_word("false") {
121            Ok(Token::Boolean(false))
122          } else if self.try_move_word("null") {
123            Ok(Token::Null)
124          } else {
125            self.parse_word()
126          }
127        }
128      };
129      match token_result {
130        Ok(token) => {
131          self.current_token = Some(token.clone());
132          Ok(Some(token))
133        }
134        Err(err) => Err(err),
135      }
136    } else {
137      self.current_token = None;
138      Ok(None)
139    }
140  }
141
  /// Gets the start position of the token.
  ///
  /// This is the byte index recorded by the most recent `scan` call after it
  /// skipped leading whitespace.
  pub fn token_start(&self) -> usize {
    self.token_start
  }
146
  /// Gets the end position of the token.
  ///
  /// This is the scanner's current byte index, i.e. the index of the first
  /// character that has not been consumed yet.
  pub fn token_end(&self) -> usize {
    self.byte_index
  }
151
152  /// Gets the current token.
153  pub fn token(&self) -> Option<Token<'a>> {
154    self.current_token.as_ref().map(|x| x.to_owned())
155  }
156
157  pub(super) fn create_error_for_current_token(&self, kind: ParseErrorKind) -> ParseError {
158    self.create_error_for_start(self.token_start, kind)
159  }
160
161  pub(super) fn create_error_for_current_char(&self, kind: ParseErrorKind) -> ParseError {
162    self.create_error_for_start(self.byte_index, kind)
163  }
164
165  pub(super) fn create_error_for_start(&self, start: usize, kind: ParseErrorKind) -> ParseError {
166    let range = Range {
167      start,
168      end: if let Some(c) = self.file_text[self.byte_index..].chars().next() {
169        self.byte_index + c.len_utf8()
170      } else {
171        self.file_text.len()
172      },
173    };
174    self.create_error_for_range(range, kind)
175  }
176
177  pub(super) fn create_error_for_range(&self, range: Range, kind: ParseErrorKind) -> ParseError {
178    ParseError::new(range, kind, self.file_text)
179  }
180
181  fn parse_string(&mut self) -> Result<Token<'a>, ParseError> {
182    crate::string::parse_string_with_char_provider(self)
183      .map(Token::String)
184      // todo(dsherret): don't convert the error kind to a string here
185      .map_err(|err| self.create_error_for_start(err.byte_index, ParseErrorKind::String(err.kind)))
186  }
187
188  fn parse_number(&mut self) -> Result<Token<'a>, ParseError> {
189    let start_byte_index = self.byte_index;
190
191    // handle unary plus and unary minus
192    if self.is_positive_sign() {
193      if !self.allow_unary_plus_numbers {
194        return Err(self.create_error_for_current_token(ParseErrorKind::UnaryPlusNumbersNotAllowed));
195      }
196      self.move_next_char();
197    } else if self.is_negative_sign() {
198      self.move_next_char();
199    }
200
201    if self.is_zero() {
202      self.move_next_char();
203
204      // check for hexadecimal literal (0x or 0X)
205      if matches!(self.current_char(), Some('x') | Some('X')) {
206        if !self.allow_hexadecimal_numbers {
207          return Err(self.create_error_for_current_token(ParseErrorKind::HexadecimalNumbersNotAllowed));
208        }
209
210        self.move_next_char();
211
212        // must have at least one hex digit
213        if !self.is_hex_digit() {
214          return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
215        }
216
217        while self.is_hex_digit() {
218          self.move_next_char();
219        }
220
221        let end_byte_index = self.byte_index;
222        return Ok(Token::Number(&self.file_text[start_byte_index..end_byte_index]));
223      }
224    } else if self.is_one_nine() {
225      self.move_next_char();
226      while self.is_digit() {
227        self.move_next_char();
228      }
229    } else {
230      return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigitFollowingNegativeSign));
231    }
232
233    if self.is_decimal_point() {
234      self.move_next_char();
235
236      if !self.is_digit() {
237        return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
238      }
239
240      while self.is_digit() {
241        self.move_next_char();
242      }
243    }
244
245    match self.current_char() {
246      Some('e') | Some('E') => {
247        match self.move_next_char() {
248          Some('-') | Some('+') => {
249            self.move_next_char();
250            if !self.is_digit() {
251              return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
252            }
253          }
254          _ => {
255            if !self.is_digit() {
256              return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedPlusMinusOrDigitInNumberLiteral));
257            }
258          }
259        }
260
261        while self.is_digit() {
262          self.move_next_char();
263        }
264      }
265      _ => {}
266    }
267
268    let end_byte_index = self.byte_index;
269    Ok(Token::Number(&self.file_text[start_byte_index..end_byte_index]))
270  }
271
272  fn parse_comment_line(&mut self) -> Token<'a> {
273    self.assert_then_move_char('/');
274    #[cfg(debug_assertions)]
275    self.assert_char('/');
276
277    let start_byte_index = self.byte_index + 1;
278    while self.move_next_char().is_some() {
279      if self.is_new_line() {
280        break;
281      }
282    }
283
284    Token::CommentLine(&self.file_text[start_byte_index..self.byte_index])
285  }
286
287  fn parse_comment_block(&mut self) -> Result<Token<'a>, ParseError> {
288    self.assert_then_move_char('/');
289    #[cfg(debug_assertions)]
290    self.assert_char('*');
291    let mut found_end = false;
292
293    let start_byte_index = self.byte_index + 1;
294    while let Some(current_char) = self.move_next_char() {
295      if current_char == '*' && self.peek_char() == Some('/') {
296        found_end = true;
297        break;
298      }
299    }
300
301    if found_end {
302      let end_byte_index = self.byte_index;
303      self.assert_then_move_char('*');
304      self.assert_then_move_char('/');
305      Ok(Token::CommentBlock(&self.file_text[start_byte_index..end_byte_index]))
306    } else {
307      Err(self.create_error_for_current_token(ParseErrorKind::UnterminatedCommentBlock))
308    }
309  }
310
311  fn skip_whitespace(&mut self) {
312    while let Some(current_char) = self.current_char() {
313      if current_char.is_whitespace() {
314        self.move_next_char();
315      } else {
316        break;
317      }
318    }
319  }
320
321  fn try_move_word(&mut self, text: &str) -> bool {
322    let mut char_index = 0;
323    for c in text.chars() {
324      if let Some(current_char) = self.peek_char_offset(char_index) {
325        if current_char != c {
326          return false;
327        }
328
329        char_index += 1;
330      } else {
331        return false;
332      }
333    }
334
335    if let Some(next_char) = self.peek_char_offset(char_index)
336      && next_char.is_alphanumeric()
337    {
338      return false;
339    }
340
341    for _ in 0..char_index {
342      self.move_next_char();
343    }
344
345    true
346  }
347
348  fn parse_word(&mut self) -> Result<Token<'a>, ParseError> {
349    let start_byte_index = self.byte_index;
350
351    while let Some(current_char) = self.current_char() {
352      // check for word terminators
353      if current_char.is_whitespace() || current_char == ':' {
354        break;
355      }
356      // validate that the character is allowed in a word literal
357      if !current_char.is_alphanumeric() && current_char != '-' && current_char != '_' {
358        return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
359      }
360
361      self.move_next_char();
362    }
363
364    let end_byte_index = self.byte_index;
365
366    if end_byte_index - start_byte_index == 0 {
367      return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
368    }
369
370    Ok(Token::Word(&self.file_text[start_byte_index..end_byte_index]))
371  }
372
  /// Advances past the current character.
  ///
  /// When debug assertions are compiled in, first checks that the current
  /// character is `_character`; otherwise the parameter is unused, hence the
  /// leading underscore.
  fn assert_then_move_char(&mut self, _character: char) {
    #[cfg(debug_assertions)]
    self.assert_char(_character);

    self.move_next_char();
  }
379
380  #[cfg(debug_assertions)]
381  fn assert_char(&mut self, character: char) {
382    let current_char = self.current_char();
383    debug_assert!(
384      current_char == Some(character),
385      "Expected {:?}, was {:?}",
386      character,
387      current_char
388    );
389  }
390
391  fn move_next_char(&mut self) -> Option<char> {
392    if let Some(&current_char) = self.char_buffer.first() {
393      // shift the entire array to the left then pop the last item
394      for i in 1..self.char_buffer.len() {
395        self.char_buffer[i - 1] = self.char_buffer[i];
396      }
397      self.char_buffer.pop();
398
399      if self.char_buffer.is_empty()
400        && let Some(new_char) = self.char_iter.next()
401      {
402        self.char_buffer.push(new_char);
403      }
404
405      self.byte_index += current_char.len_utf8();
406    }
407
408    self.current_char()
409  }
410
  /// Returns the character right after the current one without consuming
  /// anything, or `None` when there is no such character.
  fn peek_char(&mut self) -> Option<char> {
    self.peek_char_offset(1)
  }
414
415  fn peek_char_offset(&mut self, offset: usize) -> Option<char> {
416    // fill the char buffer
417    for _ in self.char_buffer.len()..offset + 1 {
418      if let Some(next_char) = self.char_iter.next() {
419        self.char_buffer.push(next_char);
420      } else {
421        // end of string
422        return None;
423      }
424    }
425
426    // should not exceed this
427    debug_assert!(self.char_buffer.len() <= CHAR_BUFFER_MAX_SIZE);
428
429    self.char_buffer.get(offset).copied()
430  }
431
432  fn current_char(&self) -> Option<char> {
433    self.char_buffer.first().copied()
434  }
435
436  fn is_new_line(&mut self) -> bool {
437    match self.current_char() {
438      Some('\n') => true,
439      Some('\r') => self.peek_char() == Some('\n'),
440      _ => false,
441    }
442  }
443
444  fn is_digit(&self) -> bool {
445    self.is_one_nine() || self.is_zero()
446  }
447
448  fn is_hex_digit(&self) -> bool {
449    match self.current_char() {
450      Some(current_char) => current_char.is_ascii_hexdigit(),
451      _ => false,
452    }
453  }
454
455  fn is_zero(&self) -> bool {
456    self.current_char() == Some('0')
457  }
458
459  fn is_one_nine(&self) -> bool {
460    match self.current_char() {
461      Some(current_char) => ('1'..='9').contains(&current_char),
462      _ => false,
463    }
464  }
465
466  fn is_negative_sign(&self) -> bool {
467    self.current_char() == Some('-')
468  }
469
470  fn is_positive_sign(&self) -> bool {
471    self.current_char() == Some('+')
472  }
473
474  fn is_decimal_point(&self) -> bool {
475    self.current_char() == Some('.')
476  }
477}
478
/// Exposes the scanner's character cursor through `CharProvider`; in this
/// file it is passed to `crate::string::parse_string_with_char_provider`.
///
/// NOTE(review): the explicit `Scanner::` paths below are meant to name the
/// scanner's inherent methods of the same name rather than these trait
/// methods; keep them as written to avoid calling back into the trait.
impl<'a> CharProvider<'a> for Scanner<'a> {
  /// Returns the scanner's current character.
  fn current_char(&mut self) -> Option<char> {
    Scanner::current_char(self)
  }

  /// Advances the scanner by one character and returns the new current
  /// character.
  fn move_next_char(&mut self) -> Option<char> {
    Scanner::move_next_char(self)
  }

  /// Returns the scanner's current byte index.
  fn byte_index(&self) -> usize {
    self.byte_index
  }

  /// Returns the complete text being scanned.
  fn text(&self) -> &'a str {
    self.file_text
  }
}
496
#[cfg(test)]
mod tests {
  use std::borrow::Cow;

  use super::super::tokens::Token;
  use super::*;
  use pretty_assertions::assert_eq;

  /// Double-quoted strings have their escape sequences decoded.
  #[test]
  fn it_tokenizes_string() {
    assert_has_tokens(
      r#""t\"est", "\t\r\n\n\u0020 test\n other","#,
      vec![
        Token::String(Cow::Borrowed(r#"t"est"#)),
        Token::Comma,
        Token::String(Cow::Borrowed("\t\r\n\n  test\n other")),
        Token::Comma,
      ],
    );
  }

  /// `\'` is not a valid escape inside a double-quoted string.
  #[test]
  fn it_errors_escaping_single_quote_in_double_quote() {
    assert_has_error(
      r#""t\'est""#,
      "Invalid escape in double quote string on line 1 column 3",
    );
  }

  /// Single-quoted strings are accepted with the default options.
  #[test]
  fn it_tokenizes_single_quote_string() {
    assert_has_tokens(
      r#"'t\'est','a',"#,
      vec![
        Token::String(Cow::Borrowed(r#"t'est"#)),
        Token::Comma,
        Token::String(Cow::Borrowed("a")),
        Token::Comma,
      ],
    );
  }

  /// `\"` is not a valid escape inside a single-quoted string.
  #[test]
  fn it_errors_escaping_double_quote_in_single_quote() {
    assert_has_error(
      r#"'t\"est'"#,
      "Invalid escape in single quote string on line 1 column 3",
    );
  }

  /// A word may not start with a character that is not allowed in words.
  #[test]
  fn it_errors_for_word_starting_with_invalid_token() {
    assert_has_error(r#"{ &test }"#, "Unexpected token on line 1 column 3");
  }

  /// Decimal numbers keep their exact source text, sign and exponent included.
  #[test]
  fn it_tokenizes_numbers() {
    assert_has_tokens(
      "0, 0.123, -198, 0e-345, 0.3e+025, 1e1,",
      vec![
        Token::Number("0"),
        Token::Comma,
        Token::Number("0.123"),
        Token::Comma,
        Token::Number("-198"),
        Token::Comma,
        Token::Number("0e-345"),
        Token::Comma,
        Token::Number("0.3e+025"),
        Token::Comma,
        Token::Number("1e1"),
        Token::Comma,
      ],
    );
  }

  /// Hexadecimal literals are accepted with either case of prefix and digits.
  #[test]
  fn it_tokenizes_hexadecimal_numbers() {
    assert_has_tokens(
      "0x7DF, 0xFF, 0x123ABC, 0xabc, 0X1F",
      vec![
        Token::Number("0x7DF"),
        Token::Comma,
        Token::Number("0xFF"),
        Token::Comma,
        Token::Number("0x123ABC"),
        Token::Comma,
        Token::Number("0xabc"),
        Token::Comma,
        Token::Number("0X1F"),
      ],
    );
  }

  /// A leading `+` is kept as part of the number text.
  #[test]
  fn it_tokenizes_unary_plus_numbers() {
    assert_has_tokens(
      "+42, +0.5, +1e10, +0xFF",
      vec![
        Token::Number("+42"),
        Token::Comma,
        Token::Number("+0.5"),
        Token::Comma,
        Token::Number("+1e10"),
        Token::Comma,
        Token::Number("+0xFF"),
      ],
    );
  }

  /// An exponent marker must be followed by a sign or digit, then digits.
  #[test]
  fn it_errors_invalid_exponent() {
    assert_has_error(
      r#"1ea"#,
      "Expected plus, minus, or digit in number literal on line 1 column 3",
    );
    assert_has_error(r#"1e-a"#, "Expected digit on line 1 column 4");
  }

  /// Punctuation and keywords each map to their own token.
  #[test]
  fn it_tokenizes_simple_tokens() {
    assert_has_tokens(
      "{}[],:true,false,null,",
      vec![
        Token::OpenBrace,
        Token::CloseBrace,
        Token::OpenBracket,
        Token::CloseBracket,
        Token::Comma,
        Token::Colon,
        Token::Boolean(true),
        Token::Comma,
        Token::Boolean(false),
        Token::Comma,
        Token::Null,
        Token::Comma,
      ],
    );
  }

  /// Line comments end at `\n` or `\r\n` and exclude the leading `//`.
  #[test]
  fn it_tokenizes_comment_line() {
    assert_has_tokens(
      "//test\n//t\r\n// test\n,",
      vec![
        Token::CommentLine("test"),
        Token::CommentLine("t"),
        Token::CommentLine(" test"),
        Token::Comma,
      ],
    );
  }

  /// Block comments may span lines and exclude their delimiters.
  #[test]
  fn it_tokenizes_comment_blocks() {
    assert_has_tokens(
      "/*test\n *//* test*/,",
      vec![
        Token::CommentBlock("test\n "),
        Token::CommentBlock(" test"),
        Token::Comma,
      ],
    );
  }

  /// An unpaired low surrogate escape is reported as an error.
  #[test]
  fn it_errors_on_invalid_utf8_char_for_issue_6() {
    assert_has_error(
      "\"\\uDF06\"",
      "Invalid unicode escape sequence. 'DF06 (unpaired low surrogate)' is not a valid UTF8 character on line 1 column 2",
    );
  }

  /// Scans `text` to the end with default options and checks that exactly
  /// `tokens` were produced; panics on the first scan error.
  fn assert_has_tokens(text: &str, tokens: Vec<Token>) {
    let mut scanner = Scanner::new(text, &ScannerOptions::default());
    let scanned_tokens: Vec<Token> = std::iter::from_fn(|| match scanner.scan() {
      Ok(next) => next,
      Err(err) => panic!("Error parsing: {:?}", err),
    })
    .collect();

    assert_eq!(scanned_tokens, tokens);
  }

  /// Scans `text` with default options until the first error and checks its
  /// message; the compared message is empty when scanning never fails.
  fn assert_has_error(text: &str, message: &str) {
    let mut scanner = Scanner::new(text, &ScannerOptions::default());
    let error_message = loop {
      match scanner.scan() {
        Ok(Some(_)) => continue,
        Ok(None) => break String::new(),
        Err(err) => break err.to_string(),
      }
    };

    assert_eq!(error_message, message);
  }
}