jsonc_parser/
scanner.rs

1use crate::string::CharProvider;
2
3use super::common::Range;
4use super::errors::*;
5use super::tokens::Token;
6use std::str::Chars;
7
8/// Converts text into a stream of tokens.
9pub struct Scanner<'a> {
10  byte_index: usize,
11  token_start: usize,
12  char_iter: Chars<'a>,
13  // todo(dsherret): why isn't this a VecDeque?
14  char_buffer: Vec<char>,
15  current_token: Option<Token<'a>>,
16  file_text: &'a str,
17}
18
19const CHAR_BUFFER_MAX_SIZE: usize = 6;
20
21impl<'a> Scanner<'a> {
22  /// Creates a new scanner based on the provided text.
23  pub fn new(file_text: &'a str) -> Scanner<'a> {
24    let mut char_iter = file_text.chars();
25    let mut char_buffer = Vec::with_capacity(CHAR_BUFFER_MAX_SIZE);
26    let current_char = char_iter.next();
27    if let Some(current_char) = current_char {
28      char_buffer.push(current_char);
29    }
30
31    Scanner {
32      byte_index: 0,
33      token_start: 0,
34      char_iter,
35      char_buffer,
36      current_token: None,
37      file_text,
38    }
39  }
40
41  pub fn file_text(&self) -> &str {
42    self.file_text
43  }
44
45  /// Moves to and returns the next token.
46  pub fn scan(&mut self) -> Result<Option<Token<'a>>, ParseError> {
47    self.skip_whitespace();
48    self.token_start = self.byte_index;
49    if let Some(current_char) = self.current_char() {
50      let token_result = match current_char {
51        '{' => {
52          self.move_next_char();
53          Ok(Token::OpenBrace)
54        }
55        '}' => {
56          self.move_next_char();
57          Ok(Token::CloseBrace)
58        }
59        '[' => {
60          self.move_next_char();
61          Ok(Token::OpenBracket)
62        }
63        ']' => {
64          self.move_next_char();
65          Ok(Token::CloseBracket)
66        }
67        ',' => {
68          self.move_next_char();
69          Ok(Token::Comma)
70        }
71        ':' => {
72          self.move_next_char();
73          Ok(Token::Colon)
74        }
75        '\'' | '"' => self.parse_string(),
76        '/' => match self.peek_char() {
77          Some('/') => Ok(self.parse_comment_line()),
78          Some('*') => self.parse_comment_block(),
79          _ => Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken)),
80        },
81        _ => {
82          if current_char == '-' || current_char == '+' || self.is_digit() {
83            self.parse_number()
84          } else if self.try_move_word("true") {
85            Ok(Token::Boolean(true))
86          } else if self.try_move_word("false") {
87            Ok(Token::Boolean(false))
88          } else if self.try_move_word("null") {
89            Ok(Token::Null)
90          } else {
91            self.parse_word()
92          }
93        }
94      };
95      match token_result {
96        Ok(token) => {
97          self.current_token = Some(token.clone());
98          Ok(Some(token))
99        }
100        Err(err) => Err(err),
101      }
102    } else {
103      self.current_token = None;
104      Ok(None)
105    }
106  }
107
108  /// Gets the start position of the token.
109  pub fn token_start(&self) -> usize {
110    self.token_start
111  }
112
113  /// Gets the end position of the token.
114  pub fn token_end(&self) -> usize {
115    self.byte_index
116  }
117
118  /// Gets the current token.
119  pub fn token(&self) -> Option<Token<'a>> {
120    self.current_token.as_ref().map(|x| x.to_owned())
121  }
122
123  pub(super) fn create_error_for_current_token(&self, kind: ParseErrorKind) -> ParseError {
124    self.create_error_for_start(self.token_start, kind)
125  }
126
127  pub(super) fn create_error_for_current_char(&self, kind: ParseErrorKind) -> ParseError {
128    self.create_error_for_start(self.byte_index, kind)
129  }
130
131  pub(super) fn create_error_for_start(&self, start: usize, kind: ParseErrorKind) -> ParseError {
132    let range = Range {
133      start,
134      end: if let Some(c) = self.file_text[self.byte_index..].chars().next() {
135        self.byte_index + c.len_utf8()
136      } else {
137        self.file_text.len()
138      },
139    };
140    self.create_error_for_range(range, kind)
141  }
142
143  pub(super) fn create_error_for_range(&self, range: Range, kind: ParseErrorKind) -> ParseError {
144    ParseError::new(range, kind, self.file_text)
145  }
146
147  fn parse_string(&mut self) -> Result<Token<'a>, ParseError> {
148    crate::string::parse_string_with_char_provider(self)
149      .map(Token::String)
150      // todo(dsherret): don't convert the error kind to a string here
151      .map_err(|err| self.create_error_for_start(err.byte_index, ParseErrorKind::String(err.kind)))
152  }
153
154  fn parse_number(&mut self) -> Result<Token<'a>, ParseError> {
155    let start_byte_index = self.byte_index;
156
157    // handle unary plus or minus
158    if self.is_negative_sign() || self.is_positive_sign() {
159      self.move_next_char();
160    }
161
162    if self.is_zero() {
163      self.move_next_char();
164
165      // check for hexadecimal literal (0x or 0X)
166      if matches!(self.current_char(), Some('x') | Some('X')) {
167        self.move_next_char();
168
169        // must have at least one hex digit
170        if !self.is_hex_digit() {
171          return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
172        }
173
174        while self.is_hex_digit() {
175          self.move_next_char();
176        }
177
178        let end_byte_index = self.byte_index;
179        return Ok(Token::Number(&self.file_text[start_byte_index..end_byte_index]));
180      }
181    } else if self.is_one_nine() {
182      self.move_next_char();
183      while self.is_digit() {
184        self.move_next_char();
185      }
186    } else {
187      return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigitFollowingNegativeSign));
188    }
189
190    if self.is_decimal_point() {
191      self.move_next_char();
192
193      if !self.is_digit() {
194        return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
195      }
196
197      while self.is_digit() {
198        self.move_next_char();
199      }
200    }
201
202    match self.current_char() {
203      Some('e') | Some('E') => {
204        match self.move_next_char() {
205          Some('-') | Some('+') => {
206            self.move_next_char();
207            if !self.is_digit() {
208              return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
209            }
210          }
211          _ => {
212            if !self.is_digit() {
213              return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedPlusMinusOrDigitInNumberLiteral));
214            }
215          }
216        }
217
218        while self.is_digit() {
219          self.move_next_char();
220        }
221      }
222      _ => {}
223    }
224
225    let end_byte_index = self.byte_index;
226    Ok(Token::Number(&self.file_text[start_byte_index..end_byte_index]))
227  }
228
229  fn parse_comment_line(&mut self) -> Token<'a> {
230    self.assert_then_move_char('/');
231    #[cfg(debug_assertions)]
232    self.assert_char('/');
233
234    let start_byte_index = self.byte_index + 1;
235    while self.move_next_char().is_some() {
236      if self.is_new_line() {
237        break;
238      }
239    }
240
241    Token::CommentLine(&self.file_text[start_byte_index..self.byte_index])
242  }
243
244  fn parse_comment_block(&mut self) -> Result<Token<'a>, ParseError> {
245    self.assert_then_move_char('/');
246    #[cfg(debug_assertions)]
247    self.assert_char('*');
248    let mut found_end = false;
249
250    let start_byte_index = self.byte_index + 1;
251    while let Some(current_char) = self.move_next_char() {
252      if current_char == '*' && self.peek_char() == Some('/') {
253        found_end = true;
254        break;
255      }
256    }
257
258    if found_end {
259      let end_byte_index = self.byte_index;
260      self.assert_then_move_char('*');
261      self.assert_then_move_char('/');
262      Ok(Token::CommentBlock(&self.file_text[start_byte_index..end_byte_index]))
263    } else {
264      Err(self.create_error_for_current_token(ParseErrorKind::UnterminatedCommentBlock))
265    }
266  }
267
268  fn skip_whitespace(&mut self) {
269    while let Some(current_char) = self.current_char() {
270      if current_char.is_whitespace() {
271        self.move_next_char();
272      } else {
273        break;
274      }
275    }
276  }
277
278  fn try_move_word(&mut self, text: &str) -> bool {
279    let mut char_index = 0;
280    for c in text.chars() {
281      if let Some(current_char) = self.peek_char_offset(char_index) {
282        if current_char != c {
283          return false;
284        }
285
286        char_index += 1;
287      } else {
288        return false;
289      }
290    }
291
292    if let Some(next_char) = self.peek_char_offset(char_index)
293      && next_char.is_alphanumeric()
294    {
295      return false;
296    }
297
298    for _ in 0..char_index {
299      self.move_next_char();
300    }
301
302    true
303  }
304
305  fn parse_word(&mut self) -> Result<Token<'a>, ParseError> {
306    let start_byte_index = self.byte_index;
307
308    while let Some(current_char) = self.current_char() {
309      // check for word terminators
310      if current_char.is_whitespace() || current_char == ':' {
311        break;
312      }
313      // validate that the character is allowed in a word literal
314      if !current_char.is_alphanumeric() && current_char != '-' && current_char != '_' {
315        return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
316      }
317
318      self.move_next_char();
319    }
320
321    let end_byte_index = self.byte_index;
322
323    if end_byte_index - start_byte_index == 0 {
324      return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
325    }
326
327    Ok(Token::Word(&self.file_text[start_byte_index..end_byte_index]))
328  }
329
330  fn assert_then_move_char(&mut self, _character: char) {
331    #[cfg(debug_assertions)]
332    self.assert_char(_character);
333
334    self.move_next_char();
335  }
336
337  #[cfg(debug_assertions)]
338  fn assert_char(&mut self, character: char) {
339    let current_char = self.current_char();
340    debug_assert!(
341      current_char == Some(character),
342      "Expected {:?}, was {:?}",
343      character,
344      current_char
345    );
346  }
347
348  fn move_next_char(&mut self) -> Option<char> {
349    if let Some(&current_char) = self.char_buffer.first() {
350      // shift the entire array to the left then pop the last item
351      for i in 1..self.char_buffer.len() {
352        self.char_buffer[i - 1] = self.char_buffer[i];
353      }
354      self.char_buffer.pop();
355
356      if self.char_buffer.is_empty()
357        && let Some(new_char) = self.char_iter.next()
358      {
359        self.char_buffer.push(new_char);
360      }
361
362      self.byte_index += current_char.len_utf8();
363    }
364
365    self.current_char()
366  }
367
368  fn peek_char(&mut self) -> Option<char> {
369    self.peek_char_offset(1)
370  }
371
372  fn peek_char_offset(&mut self, offset: usize) -> Option<char> {
373    // fill the char buffer
374    for _ in self.char_buffer.len()..offset + 1 {
375      if let Some(next_char) = self.char_iter.next() {
376        self.char_buffer.push(next_char);
377      } else {
378        // end of string
379        return None;
380      }
381    }
382
383    // should not exceed this
384    debug_assert!(self.char_buffer.len() <= CHAR_BUFFER_MAX_SIZE);
385
386    self.char_buffer.get(offset).copied()
387  }
388
389  fn current_char(&self) -> Option<char> {
390    self.char_buffer.first().copied()
391  }
392
393  fn is_new_line(&mut self) -> bool {
394    match self.current_char() {
395      Some('\n') => true,
396      Some('\r') => self.peek_char() == Some('\n'),
397      _ => false,
398    }
399  }
400
401  fn is_digit(&self) -> bool {
402    self.is_one_nine() || self.is_zero()
403  }
404
405  fn is_hex_digit(&self) -> bool {
406    match self.current_char() {
407      Some(current_char) => current_char.is_ascii_hexdigit(),
408      _ => false,
409    }
410  }
411
412  fn is_zero(&self) -> bool {
413    self.current_char() == Some('0')
414  }
415
416  fn is_one_nine(&self) -> bool {
417    match self.current_char() {
418      Some(current_char) => ('1'..='9').contains(&current_char),
419      _ => false,
420    }
421  }
422
423  fn is_negative_sign(&self) -> bool {
424    self.current_char() == Some('-')
425  }
426
427  fn is_positive_sign(&self) -> bool {
428    self.current_char() == Some('+')
429  }
430
431  fn is_decimal_point(&self) -> bool {
432    self.current_char() == Some('.')
433  }
434}
435
436impl<'a> CharProvider<'a> for Scanner<'a> {
437  fn current_char(&mut self) -> Option<char> {
438    Scanner::current_char(self)
439  }
440
441  fn move_next_char(&mut self) -> Option<char> {
442    Scanner::move_next_char(self)
443  }
444
445  fn byte_index(&self) -> usize {
446    self.byte_index
447  }
448
449  fn text(&self) -> &'a str {
450    self.file_text
451  }
452}
453
#[cfg(test)]
mod tests {
  use std::borrow::Cow;

  use super::super::tokens::Token;
  use super::*;
  use pretty_assertions::assert_eq;

  // ---- strings ----

  /// Double quoted strings, including escape sequences.
  #[test]
  fn it_tokenizes_string() {
    let expected = vec![
      Token::String(Cow::Borrowed(r#"t"est"#)),
      Token::Comma,
      Token::String(Cow::Borrowed("\t\r\n\n  test\n other")),
      Token::Comma,
    ];
    assert_has_tokens(r#""t\"est", "\t\r\n\n\u0020 test\n other","#, expected);
  }

  /// `\'` is not a valid escape inside a double quoted string.
  #[test]
  fn it_errors_escaping_single_quote_in_double_quote() {
    assert_has_error(
      r#""t\'est""#,
      "Invalid escape in double quote string on line 1 column 3",
    );
  }

  /// Single quoted strings, including an escaped single quote.
  #[test]
  fn it_tokenizes_single_quote_string() {
    let expected = vec![
      Token::String(Cow::Borrowed(r#"t'est"#)),
      Token::Comma,
      Token::String(Cow::Borrowed("a")),
      Token::Comma,
    ];
    assert_has_tokens(r#"'t\'est','a',"#, expected);
  }

  /// `\"` is not a valid escape inside a single quoted string.
  #[test]
  fn it_errors_escaping_double_quote_in_single_quote() {
    assert_has_error(
      r#"'t\"est'"#,
      "Invalid escape in single quote string on line 1 column 3",
    );
  }

  // ---- words ----

  /// A word may not start with a character that is not allowed in words.
  #[test]
  fn it_errors_for_word_starting_with_invalid_token() {
    assert_has_error(r#"{ &test }"#, "Unexpected token on line 1 column 3");
  }

  // ---- numbers ----

  /// Decimal numbers with optional sign, fraction and exponent.
  #[test]
  fn it_tokenizes_numbers() {
    let expected = vec![
      Token::Number("0"),
      Token::Comma,
      Token::Number("0.123"),
      Token::Comma,
      Token::Number("-198"),
      Token::Comma,
      Token::Number("0e-345"),
      Token::Comma,
      Token::Number("0.3e+025"),
      Token::Comma,
      Token::Number("1e1"),
      Token::Comma,
    ];
    assert_has_tokens("0, 0.123, -198, 0e-345, 0.3e+025, 1e1,", expected);
  }

  /// Hexadecimal literals with either prefix case and either digit case.
  #[test]
  fn it_tokenizes_hexadecimal_numbers() {
    let expected = vec![
      Token::Number("0x7DF"),
      Token::Comma,
      Token::Number("0xFF"),
      Token::Comma,
      Token::Number("0x123ABC"),
      Token::Comma,
      Token::Number("0xabc"),
      Token::Comma,
      Token::Number("0X1F"),
    ];
    assert_has_tokens("0x7DF, 0xFF, 0x123ABC, 0xabc, 0X1F", expected);
  }

  /// A leading `+` is kept as part of the number text.
  #[test]
  fn it_tokenizes_unary_plus_numbers() {
    let expected = vec![
      Token::Number("+42"),
      Token::Comma,
      Token::Number("+0.5"),
      Token::Comma,
      Token::Number("+1e10"),
      Token::Comma,
      Token::Number("+0xFF"),
    ];
    assert_has_tokens("+42, +0.5, +1e10, +0xFF", expected);
  }

  /// An exponent needs a digit, optionally preceded by a sign.
  #[test]
  fn it_errors_invalid_exponent() {
    assert_has_error(
      r#"1ea"#,
      "Expected plus, minus, or digit in number literal on line 1 column 3",
    );
    assert_has_error(r#"1e-a"#, "Expected digit on line 1 column 4");
  }

  // ---- punctuation and keywords ----

  /// Punctuation plus the `true`, `false` and `null` keywords.
  #[test]
  fn it_tokenizes_simple_tokens() {
    let expected = vec![
      Token::OpenBrace,
      Token::CloseBrace,
      Token::OpenBracket,
      Token::CloseBracket,
      Token::Comma,
      Token::Colon,
      Token::Boolean(true),
      Token::Comma,
      Token::Boolean(false),
      Token::Comma,
      Token::Null,
      Token::Comma,
    ];
    assert_has_tokens("{}[],:true,false,null,", expected);
  }

  // ---- comments ----

  /// Line comments end at `\n` or `\r\n`, which is not part of the text.
  #[test]
  fn it_tokenizes_comment_line() {
    let expected = vec![
      Token::CommentLine("test"),
      Token::CommentLine("t"),
      Token::CommentLine(" test"),
      Token::Comma,
    ];
    assert_has_tokens("//test\n//t\r\n// test\n,", expected);
  }

  /// Block comments may span lines and may directly follow each other.
  #[test]
  fn it_tokenizes_comment_blocks() {
    let expected = vec![
      Token::CommentBlock("test\n "),
      Token::CommentBlock(" test"),
      Token::Comma,
    ];
    assert_has_tokens("/*test\n *//* test*/,", expected);
  }

  // ---- regressions ----

  /// A unicode escape that is not a valid character must be an error.
  #[test]
  fn it_errors_on_invalid_utf8_char_for_issue_6() {
    assert_has_error(
      "\"\\uDF06\"",
      "Invalid unicode escape sequence. 'DF06' is not a valid UTF8 character on line 1 column 2",
    );
  }

  // ---- helpers ----

  /// Scans `text` to the end and checks that exactly `tokens` were produced.
  /// Panics when the scanner reports an error.
  fn assert_has_tokens(text: &str, tokens: Vec<Token>) {
    let mut scanner = Scanner::new(text);
    let mut scanned_tokens = Vec::new();

    while let Some(token) = scanner
      .scan()
      .unwrap_or_else(|err| panic!("Error parsing: {:?}", err))
    {
      scanned_tokens.push(token);
    }

    assert_eq!(scanned_tokens, tokens);
  }

  /// Scans `text` until the first error and checks its display text equals
  /// `message`. If the scan finishes without an error, the comparison is made
  /// against an empty string.
  fn assert_has_error(text: &str, message: &str) {
    let mut scanner = Scanner::new(text);

    let error_message = loop {
      match scanner.scan() {
        Ok(Some(_)) => continue,
        Ok(None) => break String::new(),
        Err(err) => break err.to_string(),
      }
    };

    assert_eq!(error_message, message);
  }
}