1use crate::string::CharProvider;
2
3use super::common::Range;
4use super::errors::*;
5use super::tokens::Token;
6use std::str::Chars;
7
8pub struct Scanner<'a> {
10 byte_index: usize,
11 token_start: usize,
12 char_iter: Chars<'a>,
13 char_buffer: Vec<char>,
15 current_token: Option<Token<'a>>,
16 file_text: &'a str,
17 allow_single_quoted_strings: bool,
18 allow_hexadecimal_numbers: bool,
19 allow_unary_plus_numbers: bool,
20}
21
/// Upper bound on the number of characters held in the scanner's lookahead buffer.
///
/// The deepest lookahead in this file is `try_move_word("false")`, which peeks the five
/// characters of the word plus the one that follows it.
const CHAR_BUFFER_MAX_SIZE: usize = 6;
23
/// Switches controlling which syntax extensions the scanner accepts.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ScannerOptions {
    /// Accept strings delimited by single quotes (`'text'`).
    pub allow_single_quoted_strings: bool,
    /// Accept hexadecimal number literals (`0x1F`, `0X1F`).
    pub allow_hexadecimal_numbers: bool,
    /// Accept number literals with a leading plus sign (`+42`).
    pub allow_unary_plus_numbers: bool,
}
34
35impl Default for ScannerOptions {
36 fn default() -> Self {
37 Self {
38 allow_single_quoted_strings: true,
39 allow_hexadecimal_numbers: true,
40 allow_unary_plus_numbers: true,
41 }
42 }
43}
44
45impl<'a> Scanner<'a> {
46 pub fn new(file_text: &'a str, options: &ScannerOptions) -> Scanner<'a> {
48 let mut char_iter = file_text.chars();
49 let mut char_buffer = Vec::with_capacity(CHAR_BUFFER_MAX_SIZE);
50 let current_char = char_iter.next();
51 if let Some(current_char) = current_char {
52 char_buffer.push(current_char);
53 }
54
55 Scanner {
56 byte_index: 0,
57 token_start: 0,
58 char_iter,
59 char_buffer,
60 current_token: None,
61 file_text,
62 allow_single_quoted_strings: options.allow_single_quoted_strings,
63 allow_hexadecimal_numbers: options.allow_hexadecimal_numbers,
64 allow_unary_plus_numbers: options.allow_unary_plus_numbers,
65 }
66 }
67
68 pub fn file_text(&self) -> &str {
69 self.file_text
70 }
71
72 pub fn scan(&mut self) -> Result<Option<Token<'a>>, ParseError> {
74 self.skip_whitespace();
75 self.token_start = self.byte_index;
76 if let Some(current_char) = self.current_char() {
77 let token_result = match current_char {
78 '{' => {
79 self.move_next_char();
80 Ok(Token::OpenBrace)
81 }
82 '}' => {
83 self.move_next_char();
84 Ok(Token::CloseBrace)
85 }
86 '[' => {
87 self.move_next_char();
88 Ok(Token::OpenBracket)
89 }
90 ']' => {
91 self.move_next_char();
92 Ok(Token::CloseBracket)
93 }
94 ',' => {
95 self.move_next_char();
96 Ok(Token::Comma)
97 }
98 ':' => {
99 self.move_next_char();
100 Ok(Token::Colon)
101 }
102 '\'' => {
103 if self.allow_single_quoted_strings {
104 self.parse_string()
105 } else {
106 Err(self.create_error_for_current_token(ParseErrorKind::SingleQuotedStringsNotAllowed))
107 }
108 }
109 '"' => self.parse_string(),
110 '/' => match self.peek_char() {
111 Some('/') => Ok(self.parse_comment_line()),
112 Some('*') => self.parse_comment_block(),
113 _ => Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken)),
114 },
115 _ => {
116 if current_char == '-' || current_char == '+' || self.is_digit() {
117 self.parse_number()
118 } else if self.try_move_word("true") {
119 Ok(Token::Boolean(true))
120 } else if self.try_move_word("false") {
121 Ok(Token::Boolean(false))
122 } else if self.try_move_word("null") {
123 Ok(Token::Null)
124 } else {
125 self.parse_word()
126 }
127 }
128 };
129 match token_result {
130 Ok(token) => {
131 self.current_token = Some(token.clone());
132 Ok(Some(token))
133 }
134 Err(err) => Err(err),
135 }
136 } else {
137 self.current_token = None;
138 Ok(None)
139 }
140 }
141
142 pub fn token_start(&self) -> usize {
144 self.token_start
145 }
146
147 pub fn token_end(&self) -> usize {
149 self.byte_index
150 }
151
152 pub fn token(&self) -> Option<Token<'a>> {
154 self.current_token.as_ref().map(|x| x.to_owned())
155 }
156
157 pub(super) fn create_error_for_current_token(&self, kind: ParseErrorKind) -> ParseError {
158 self.create_error_for_start(self.token_start, kind)
159 }
160
161 pub(super) fn create_error_for_current_char(&self, kind: ParseErrorKind) -> ParseError {
162 self.create_error_for_start(self.byte_index, kind)
163 }
164
165 pub(super) fn create_error_for_start(&self, start: usize, kind: ParseErrorKind) -> ParseError {
166 let range = Range {
167 start,
168 end: if let Some(c) = self.file_text[self.byte_index..].chars().next() {
169 self.byte_index + c.len_utf8()
170 } else {
171 self.file_text.len()
172 },
173 };
174 self.create_error_for_range(range, kind)
175 }
176
177 pub(super) fn create_error_for_range(&self, range: Range, kind: ParseErrorKind) -> ParseError {
178 ParseError::new(range, kind, self.file_text)
179 }
180
181 fn parse_string(&mut self) -> Result<Token<'a>, ParseError> {
182 crate::string::parse_string_with_char_provider(self)
183 .map(Token::String)
184 .map_err(|err| self.create_error_for_start(err.byte_index, ParseErrorKind::String(err.kind)))
186 }
187
188 fn parse_number(&mut self) -> Result<Token<'a>, ParseError> {
189 let start_byte_index = self.byte_index;
190
191 if self.is_positive_sign() {
193 if !self.allow_unary_plus_numbers {
194 return Err(self.create_error_for_current_token(ParseErrorKind::UnaryPlusNumbersNotAllowed));
195 }
196 self.move_next_char();
197 } else if self.is_negative_sign() {
198 self.move_next_char();
199 }
200
201 if self.is_zero() {
202 self.move_next_char();
203
204 if matches!(self.current_char(), Some('x') | Some('X')) {
206 if !self.allow_hexadecimal_numbers {
207 return Err(self.create_error_for_current_token(ParseErrorKind::HexadecimalNumbersNotAllowed));
208 }
209
210 self.move_next_char();
211
212 if !self.is_hex_digit() {
214 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
215 }
216
217 while self.is_hex_digit() {
218 self.move_next_char();
219 }
220
221 let end_byte_index = self.byte_index;
222 return Ok(Token::Number(&self.file_text[start_byte_index..end_byte_index]));
223 }
224 } else if self.is_one_nine() {
225 self.move_next_char();
226 while self.is_digit() {
227 self.move_next_char();
228 }
229 } else {
230 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigitFollowingNegativeSign));
231 }
232
233 if self.is_decimal_point() {
234 self.move_next_char();
235
236 if !self.is_digit() {
237 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
238 }
239
240 while self.is_digit() {
241 self.move_next_char();
242 }
243 }
244
245 match self.current_char() {
246 Some('e') | Some('E') => {
247 match self.move_next_char() {
248 Some('-') | Some('+') => {
249 self.move_next_char();
250 if !self.is_digit() {
251 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
252 }
253 }
254 _ => {
255 if !self.is_digit() {
256 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedPlusMinusOrDigitInNumberLiteral));
257 }
258 }
259 }
260
261 while self.is_digit() {
262 self.move_next_char();
263 }
264 }
265 _ => {}
266 }
267
268 let end_byte_index = self.byte_index;
269 Ok(Token::Number(&self.file_text[start_byte_index..end_byte_index]))
270 }
271
272 fn parse_comment_line(&mut self) -> Token<'a> {
273 self.assert_then_move_char('/');
274 #[cfg(debug_assertions)]
275 self.assert_char('/');
276
277 let start_byte_index = self.byte_index + 1;
278 while self.move_next_char().is_some() {
279 if self.is_new_line() {
280 break;
281 }
282 }
283
284 Token::CommentLine(&self.file_text[start_byte_index..self.byte_index])
285 }
286
287 fn parse_comment_block(&mut self) -> Result<Token<'a>, ParseError> {
288 self.assert_then_move_char('/');
289 #[cfg(debug_assertions)]
290 self.assert_char('*');
291 let mut found_end = false;
292
293 let start_byte_index = self.byte_index + 1;
294 while let Some(current_char) = self.move_next_char() {
295 if current_char == '*' && self.peek_char() == Some('/') {
296 found_end = true;
297 break;
298 }
299 }
300
301 if found_end {
302 let end_byte_index = self.byte_index;
303 self.assert_then_move_char('*');
304 self.assert_then_move_char('/');
305 Ok(Token::CommentBlock(&self.file_text[start_byte_index..end_byte_index]))
306 } else {
307 Err(self.create_error_for_current_token(ParseErrorKind::UnterminatedCommentBlock))
308 }
309 }
310
311 fn skip_whitespace(&mut self) {
312 while let Some(current_char) = self.current_char() {
313 if current_char.is_whitespace() {
314 self.move_next_char();
315 } else {
316 break;
317 }
318 }
319 }
320
321 fn try_move_word(&mut self, text: &str) -> bool {
322 let mut char_index = 0;
323 for c in text.chars() {
324 if let Some(current_char) = self.peek_char_offset(char_index) {
325 if current_char != c {
326 return false;
327 }
328
329 char_index += 1;
330 } else {
331 return false;
332 }
333 }
334
335 if let Some(next_char) = self.peek_char_offset(char_index)
336 && next_char.is_alphanumeric()
337 {
338 return false;
339 }
340
341 for _ in 0..char_index {
342 self.move_next_char();
343 }
344
345 true
346 }
347
348 fn parse_word(&mut self) -> Result<Token<'a>, ParseError> {
349 let start_byte_index = self.byte_index;
350
351 while let Some(current_char) = self.current_char() {
352 if current_char.is_whitespace() || current_char == ':' {
354 break;
355 }
356 if !current_char.is_alphanumeric() && current_char != '-' && current_char != '_' {
358 return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
359 }
360
361 self.move_next_char();
362 }
363
364 let end_byte_index = self.byte_index;
365
366 if end_byte_index - start_byte_index == 0 {
367 return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
368 }
369
370 Ok(Token::Word(&self.file_text[start_byte_index..end_byte_index]))
371 }
372
373 fn assert_then_move_char(&mut self, _character: char) {
374 #[cfg(debug_assertions)]
375 self.assert_char(_character);
376
377 self.move_next_char();
378 }
379
380 #[cfg(debug_assertions)]
381 fn assert_char(&mut self, character: char) {
382 let current_char = self.current_char();
383 debug_assert!(
384 current_char == Some(character),
385 "Expected {:?}, was {:?}",
386 character,
387 current_char
388 );
389 }
390
391 fn move_next_char(&mut self) -> Option<char> {
392 if let Some(¤t_char) = self.char_buffer.first() {
393 for i in 1..self.char_buffer.len() {
395 self.char_buffer[i - 1] = self.char_buffer[i];
396 }
397 self.char_buffer.pop();
398
399 if self.char_buffer.is_empty()
400 && let Some(new_char) = self.char_iter.next()
401 {
402 self.char_buffer.push(new_char);
403 }
404
405 self.byte_index += current_char.len_utf8();
406 }
407
408 self.current_char()
409 }
410
411 fn peek_char(&mut self) -> Option<char> {
412 self.peek_char_offset(1)
413 }
414
415 fn peek_char_offset(&mut self, offset: usize) -> Option<char> {
416 for _ in self.char_buffer.len()..offset + 1 {
418 if let Some(next_char) = self.char_iter.next() {
419 self.char_buffer.push(next_char);
420 } else {
421 return None;
423 }
424 }
425
426 debug_assert!(self.char_buffer.len() <= CHAR_BUFFER_MAX_SIZE);
428
429 self.char_buffer.get(offset).copied()
430 }
431
432 fn current_char(&self) -> Option<char> {
433 self.char_buffer.first().copied()
434 }
435
436 fn is_new_line(&mut self) -> bool {
437 match self.current_char() {
438 Some('\n') => true,
439 Some('\r') => self.peek_char() == Some('\n'),
440 _ => false,
441 }
442 }
443
444 fn is_digit(&self) -> bool {
445 self.is_one_nine() || self.is_zero()
446 }
447
448 fn is_hex_digit(&self) -> bool {
449 match self.current_char() {
450 Some(current_char) => current_char.is_ascii_hexdigit(),
451 _ => false,
452 }
453 }
454
455 fn is_zero(&self) -> bool {
456 self.current_char() == Some('0')
457 }
458
459 fn is_one_nine(&self) -> bool {
460 match self.current_char() {
461 Some(current_char) => ('1'..='9').contains(¤t_char),
462 _ => false,
463 }
464 }
465
466 fn is_negative_sign(&self) -> bool {
467 self.current_char() == Some('-')
468 }
469
470 fn is_positive_sign(&self) -> bool {
471 self.current_char() == Some('+')
472 }
473
474 fn is_decimal_point(&self) -> bool {
475 self.current_char() == Some('.')
476 }
477}
478
479impl<'a> CharProvider<'a> for Scanner<'a> {
480 fn current_char(&mut self) -> Option<char> {
481 Scanner::current_char(self)
482 }
483
484 fn move_next_char(&mut self) -> Option<char> {
485 Scanner::move_next_char(self)
486 }
487
488 fn byte_index(&self) -> usize {
489 self.byte_index
490 }
491
492 fn text(&self) -> &'a str {
493 self.file_text
494 }
495}
496
#[cfg(test)]
mod tests {
    use std::borrow::Cow;

    use super::super::tokens::Token;
    use super::*;
    use pretty_assertions::assert_eq;

    #[test]
    fn it_tokenizes_string() {
        assert_has_tokens(
            r#""t\"est", "\t\r\n\n\u0020 test\n other","#,
            vec![
                Token::String(Cow::Borrowed(r#"t"est"#)),
                Token::Comma,
                // `\u0020` decodes to a space and is followed by a literal space in the
                // input, so the decoded text contains two consecutive spaces.
                Token::String(Cow::Borrowed("\t\r\n\n  test\n other")),
                Token::Comma,
            ],
        );
    }

    #[test]
    fn it_errors_escaping_single_quote_in_double_quote() {
        assert_has_error(
            r#""t\'est""#,
            "Invalid escape in double quote string on line 1 column 3",
        );
    }

    #[test]
    fn it_tokenizes_single_quote_string() {
        assert_has_tokens(
            r#"'t\'est','a',"#,
            vec![
                Token::String(Cow::Borrowed(r#"t'est"#)),
                Token::Comma,
                Token::String(Cow::Borrowed("a")),
                Token::Comma,
            ],
        );
    }

    #[test]
    fn it_errors_escaping_double_quote_in_single_quote() {
        assert_has_error(
            r#"'t\"est'"#,
            "Invalid escape in single quote string on line 1 column 3",
        );
    }

    #[test]
    fn it_errors_for_word_starting_with_invalid_token() {
        assert_has_error(r#"{ &test }"#, "Unexpected token on line 1 column 3");
    }

    #[test]
    fn it_tokenizes_numbers() {
        assert_has_tokens(
            "0, 0.123, -198, 0e-345, 0.3e+025, 1e1,",
            vec![
                Token::Number("0"),
                Token::Comma,
                Token::Number("0.123"),
                Token::Comma,
                Token::Number("-198"),
                Token::Comma,
                Token::Number("0e-345"),
                Token::Comma,
                Token::Number("0.3e+025"),
                Token::Comma,
                Token::Number("1e1"),
                Token::Comma,
            ],
        );
    }

    #[test]
    fn it_tokenizes_hexadecimal_numbers() {
        assert_has_tokens(
            "0x7DF, 0xFF, 0x123ABC, 0xabc, 0X1F",
            vec![
                Token::Number("0x7DF"),
                Token::Comma,
                Token::Number("0xFF"),
                Token::Comma,
                Token::Number("0x123ABC"),
                Token::Comma,
                Token::Number("0xabc"),
                Token::Comma,
                Token::Number("0X1F"),
            ],
        );
    }

    #[test]
    fn it_tokenizes_unary_plus_numbers() {
        assert_has_tokens(
            "+42, +0.5, +1e10, +0xFF",
            vec![
                Token::Number("+42"),
                Token::Comma,
                Token::Number("+0.5"),
                Token::Comma,
                Token::Number("+1e10"),
                Token::Comma,
                Token::Number("+0xFF"),
            ],
        );
    }

    #[test]
    fn it_errors_invalid_exponent() {
        assert_has_error(
            r#"1ea"#,
            "Expected plus, minus, or digit in number literal on line 1 column 3",
        );
        assert_has_error(r#"1e-a"#, "Expected digit on line 1 column 4");
    }

    #[test]
    fn it_tokenizes_simple_tokens() {
        assert_has_tokens(
            "{}[],:true,false,null,",
            vec![
                Token::OpenBrace,
                Token::CloseBrace,
                Token::OpenBracket,
                Token::CloseBracket,
                Token::Comma,
                Token::Colon,
                Token::Boolean(true),
                Token::Comma,
                Token::Boolean(false),
                Token::Comma,
                Token::Null,
                Token::Comma,
            ],
        );
    }

    #[test]
    fn it_tokenizes_comment_line() {
        assert_has_tokens(
            "//test\n//t\r\n// test\n,",
            vec![
                Token::CommentLine("test"),
                Token::CommentLine("t"),
                Token::CommentLine(" test"),
                Token::Comma,
            ],
        );
    }

    #[test]
    fn it_tokenizes_comment_blocks() {
        assert_has_tokens(
            "/*test\n *//* test*/,",
            vec![
                Token::CommentBlock("test\n "),
                Token::CommentBlock(" test"),
                Token::Comma,
            ],
        );
    }

    #[test]
    fn it_errors_on_invalid_utf8_char_for_issue_6() {
        assert_has_error(
            "\"\\uDF06\"",
            "Invalid unicode escape sequence. 'DF06 (unpaired low surrogate)' is not a valid UTF8 character on line 1 column 2",
        );
    }

    /// Scans `text` to the end with default options and asserts that exactly `tokens` were
    /// produced; panics on the first scan error.
    fn assert_has_tokens(text: &str, tokens: Vec<Token>) {
        let mut scanner = Scanner::new(text, &Default::default());
        let mut scanned_tokens = Vec::new();

        loop {
            match scanner.scan() {
                Ok(Some(token)) => scanned_tokens.push(token),
                Ok(None) => break,
                Err(err) => panic!("Error parsing: {:?}", err),
            }
        }

        assert_eq!(scanned_tokens, tokens);
    }

    /// Scans `text` with default options until the first error and asserts that its display
    /// text equals `message`. If no error occurs the comparison is made against an empty string.
    fn assert_has_error(text: &str, message: &str) {
        let mut scanner = Scanner::new(text, &Default::default());
        let mut error_message = String::new();

        loop {
            match scanner.scan() {
                Ok(Some(_)) => {}
                Ok(None) => break,
                Err(err) => {
                    error_message = err.to_string();
                    break;
                }
            }
        }

        assert_eq!(error_message, message);
    }
}