1use crate::string::CharProvider;
2
3use super::common::Range;
4use super::errors::*;
5use super::tokens::Token;
6use std::str::Chars;
7
/// Converts text into a stream of tokens, one `scan` call at a time.
pub struct Scanner<'a> {
  /// Byte offset into `file_text` of the current (not yet consumed) character.
  byte_index: usize,
  /// Byte offset recorded by `scan` after skipping whitespace, i.e. where the latest token begins.
  token_start: usize,
  /// Supplies the characters of `file_text` that have not been pulled into `char_buffer` yet.
  char_iter: Chars<'a>,
  /// Lookahead window over the upcoming characters; index 0 is the current character.
  char_buffer: Vec<char>,
  /// Token stored by the last successful `scan`; reset to `None` when `scan` reaches end of input.
  current_token: Option<Token<'a>>,
  /// The complete text being scanned.
  file_text: &'a str,
}
18
/// Expected upper bound on the number of characters held in the scanner's lookahead buffer.
/// Used as the buffer's initial capacity and checked by a debug assertion after peeking.
const CHAR_BUFFER_MAX_SIZE: usize = 6;
20
21impl<'a> Scanner<'a> {
22 pub fn new(file_text: &'a str) -> Scanner<'a> {
24 let mut char_iter = file_text.chars();
25 let mut char_buffer = Vec::with_capacity(CHAR_BUFFER_MAX_SIZE);
26 let current_char = char_iter.next();
27 if let Some(current_char) = current_char {
28 char_buffer.push(current_char);
29 }
30
31 Scanner {
32 byte_index: 0,
33 token_start: 0,
34 char_iter,
35 char_buffer,
36 current_token: None,
37 file_text,
38 }
39 }
40
41 pub fn file_text(&self) -> &str {
42 self.file_text
43 }
44
45 pub fn scan(&mut self) -> Result<Option<Token<'a>>, ParseError> {
47 self.skip_whitespace();
48 self.token_start = self.byte_index;
49 if let Some(current_char) = self.current_char() {
50 let token_result = match current_char {
51 '{' => {
52 self.move_next_char();
53 Ok(Token::OpenBrace)
54 }
55 '}' => {
56 self.move_next_char();
57 Ok(Token::CloseBrace)
58 }
59 '[' => {
60 self.move_next_char();
61 Ok(Token::OpenBracket)
62 }
63 ']' => {
64 self.move_next_char();
65 Ok(Token::CloseBracket)
66 }
67 ',' => {
68 self.move_next_char();
69 Ok(Token::Comma)
70 }
71 ':' => {
72 self.move_next_char();
73 Ok(Token::Colon)
74 }
75 '\'' | '"' => self.parse_string(),
76 '/' => match self.peek_char() {
77 Some('/') => Ok(self.parse_comment_line()),
78 Some('*') => self.parse_comment_block(),
79 _ => Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken)),
80 },
81 _ => {
82 if current_char == '-' || current_char == '+' || self.is_digit() {
83 self.parse_number()
84 } else if self.try_move_word("true") {
85 Ok(Token::Boolean(true))
86 } else if self.try_move_word("false") {
87 Ok(Token::Boolean(false))
88 } else if self.try_move_word("null") {
89 Ok(Token::Null)
90 } else {
91 self.parse_word()
92 }
93 }
94 };
95 match token_result {
96 Ok(token) => {
97 self.current_token = Some(token.clone());
98 Ok(Some(token))
99 }
100 Err(err) => Err(err),
101 }
102 } else {
103 self.current_token = None;
104 Ok(None)
105 }
106 }
107
  /// Gets the byte index recorded at the start of the most recent `scan` call, after
  /// leading whitespace was skipped.
  pub fn token_start(&self) -> usize {
    self.token_start
  }
112
  /// Gets the scanner's current byte index. Immediately after a successful `scan` this is
  /// the position just past the characters consumed for that token.
  pub fn token_end(&self) -> usize {
    self.byte_index
  }
117
118 pub fn token(&self) -> Option<Token<'a>> {
120 self.current_token.as_ref().map(|x| x.to_owned())
121 }
122
  /// Creates an error of the given kind whose range begins at the start of the current token.
  pub(super) fn create_error_for_current_token(&self, kind: ParseErrorKind) -> ParseError {
    self.create_error_for_start(self.token_start, kind)
  }
126
  /// Creates an error of the given kind whose range begins at the current character.
  pub(super) fn create_error_for_current_char(&self, kind: ParseErrorKind) -> ParseError {
    self.create_error_for_start(self.byte_index, kind)
  }
130
131 pub(super) fn create_error_for_start(&self, start: usize, kind: ParseErrorKind) -> ParseError {
132 let range = Range {
133 start,
134 end: if let Some(c) = self.file_text[self.byte_index..].chars().next() {
135 self.byte_index + c.len_utf8()
136 } else {
137 self.file_text.len()
138 },
139 };
140 self.create_error_for_range(range, kind)
141 }
142
  /// Creates an error of the given kind for an explicit range of the scanned text.
  pub(super) fn create_error_for_range(&self, range: Range, kind: ParseErrorKind) -> ParseError {
    ParseError::new(range, kind, self.file_text)
  }
146
147 fn parse_string(&mut self) -> Result<Token<'a>, ParseError> {
148 crate::string::parse_string_with_char_provider(self)
149 .map(Token::String)
150 .map_err(|err| self.create_error_for_start(err.byte_index, ParseErrorKind::String(err.kind)))
152 }
153
154 fn parse_number(&mut self) -> Result<Token<'a>, ParseError> {
155 let start_byte_index = self.byte_index;
156
157 if self.is_negative_sign() || self.is_positive_sign() {
159 self.move_next_char();
160 }
161
162 if self.is_zero() {
163 self.move_next_char();
164
165 if matches!(self.current_char(), Some('x') | Some('X')) {
167 self.move_next_char();
168
169 if !self.is_hex_digit() {
171 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
172 }
173
174 while self.is_hex_digit() {
175 self.move_next_char();
176 }
177
178 let end_byte_index = self.byte_index;
179 return Ok(Token::Number(&self.file_text[start_byte_index..end_byte_index]));
180 }
181 } else if self.is_one_nine() {
182 self.move_next_char();
183 while self.is_digit() {
184 self.move_next_char();
185 }
186 } else {
187 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigitFollowingNegativeSign));
188 }
189
190 if self.is_decimal_point() {
191 self.move_next_char();
192
193 if !self.is_digit() {
194 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
195 }
196
197 while self.is_digit() {
198 self.move_next_char();
199 }
200 }
201
202 match self.current_char() {
203 Some('e') | Some('E') => {
204 match self.move_next_char() {
205 Some('-') | Some('+') => {
206 self.move_next_char();
207 if !self.is_digit() {
208 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
209 }
210 }
211 _ => {
212 if !self.is_digit() {
213 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedPlusMinusOrDigitInNumberLiteral));
214 }
215 }
216 }
217
218 while self.is_digit() {
219 self.move_next_char();
220 }
221 }
222 _ => {}
223 }
224
225 let end_byte_index = self.byte_index;
226 Ok(Token::Number(&self.file_text[start_byte_index..end_byte_index]))
227 }
228
229 fn parse_comment_line(&mut self) -> Token<'a> {
230 self.assert_then_move_char('/');
231 #[cfg(debug_assertions)]
232 self.assert_char('/');
233
234 let start_byte_index = self.byte_index + 1;
235 while self.move_next_char().is_some() {
236 if self.is_new_line() {
237 break;
238 }
239 }
240
241 Token::CommentLine(&self.file_text[start_byte_index..self.byte_index])
242 }
243
244 fn parse_comment_block(&mut self) -> Result<Token<'a>, ParseError> {
245 self.assert_then_move_char('/');
246 #[cfg(debug_assertions)]
247 self.assert_char('*');
248 let mut found_end = false;
249
250 let start_byte_index = self.byte_index + 1;
251 while let Some(current_char) = self.move_next_char() {
252 if current_char == '*' && self.peek_char() == Some('/') {
253 found_end = true;
254 break;
255 }
256 }
257
258 if found_end {
259 let end_byte_index = self.byte_index;
260 self.assert_then_move_char('*');
261 self.assert_then_move_char('/');
262 Ok(Token::CommentBlock(&self.file_text[start_byte_index..end_byte_index]))
263 } else {
264 Err(self.create_error_for_current_token(ParseErrorKind::UnterminatedCommentBlock))
265 }
266 }
267
268 fn skip_whitespace(&mut self) {
269 while let Some(current_char) = self.current_char() {
270 if current_char.is_whitespace() {
271 self.move_next_char();
272 } else {
273 break;
274 }
275 }
276 }
277
278 fn try_move_word(&mut self, text: &str) -> bool {
279 let mut char_index = 0;
280 for c in text.chars() {
281 if let Some(current_char) = self.peek_char_offset(char_index) {
282 if current_char != c {
283 return false;
284 }
285
286 char_index += 1;
287 } else {
288 return false;
289 }
290 }
291
292 if let Some(next_char) = self.peek_char_offset(char_index)
293 && next_char.is_alphanumeric()
294 {
295 return false;
296 }
297
298 for _ in 0..char_index {
299 self.move_next_char();
300 }
301
302 true
303 }
304
305 fn parse_word(&mut self) -> Result<Token<'a>, ParseError> {
306 let start_byte_index = self.byte_index;
307
308 while let Some(current_char) = self.current_char() {
309 if current_char.is_whitespace() || current_char == ':' {
311 break;
312 }
313 if !current_char.is_alphanumeric() && current_char != '-' && current_char != '_' {
315 return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
316 }
317
318 self.move_next_char();
319 }
320
321 let end_byte_index = self.byte_index;
322
323 if end_byte_index - start_byte_index == 0 {
324 return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
325 }
326
327 Ok(Token::Word(&self.file_text[start_byte_index..end_byte_index]))
328 }
329
  /// Advances past the current character. In builds with debug assertions it first checks
  /// that the current character is `_character`; otherwise the parameter is unused, hence
  /// the leading underscore.
  fn assert_then_move_char(&mut self, _character: char) {
    #[cfg(debug_assertions)]
    self.assert_char(_character);

    self.move_next_char();
  }
336
  /// Debug-only check that the current character equals `character`; panics with both
  /// values in the message when it does not.
  #[cfg(debug_assertions)]
  fn assert_char(&mut self, character: char) {
    let current_char = self.current_char();
    debug_assert!(
      current_char == Some(character),
      "Expected {:?}, was {:?}",
      character,
      current_char
    );
  }
347
348 fn move_next_char(&mut self) -> Option<char> {
349 if let Some(¤t_char) = self.char_buffer.first() {
350 for i in 1..self.char_buffer.len() {
352 self.char_buffer[i - 1] = self.char_buffer[i];
353 }
354 self.char_buffer.pop();
355
356 if self.char_buffer.is_empty()
357 && let Some(new_char) = self.char_iter.next()
358 {
359 self.char_buffer.push(new_char);
360 }
361
362 self.byte_index += current_char.len_utf8();
363 }
364
365 self.current_char()
366 }
367
  /// Returns the character right after the current one without consuming anything,
  /// or `None` when the text ends before it.
  fn peek_char(&mut self) -> Option<char> {
    self.peek_char_offset(1)
  }
371
372 fn peek_char_offset(&mut self, offset: usize) -> Option<char> {
373 for _ in self.char_buffer.len()..offset + 1 {
375 if let Some(next_char) = self.char_iter.next() {
376 self.char_buffer.push(next_char);
377 } else {
378 return None;
380 }
381 }
382
383 debug_assert!(self.char_buffer.len() <= CHAR_BUFFER_MAX_SIZE);
385
386 self.char_buffer.get(offset).copied()
387 }
388
  /// Returns the current character (the front of the lookahead buffer), or `None` when
  /// the buffer is empty.
  fn current_char(&self) -> Option<char> {
    self.char_buffer.first().copied()
  }
392
393 fn is_new_line(&mut self) -> bool {
394 match self.current_char() {
395 Some('\n') => true,
396 Some('\r') => self.peek_char() == Some('\n'),
397 _ => false,
398 }
399 }
400
401 fn is_digit(&self) -> bool {
402 self.is_one_nine() || self.is_zero()
403 }
404
405 fn is_hex_digit(&self) -> bool {
406 match self.current_char() {
407 Some(current_char) => current_char.is_ascii_hexdigit(),
408 _ => false,
409 }
410 }
411
412 fn is_zero(&self) -> bool {
413 self.current_char() == Some('0')
414 }
415
416 fn is_one_nine(&self) -> bool {
417 match self.current_char() {
418 Some(current_char) => ('1'..='9').contains(¤t_char),
419 _ => false,
420 }
421 }
422
423 fn is_negative_sign(&self) -> bool {
424 self.current_char() == Some('-')
425 }
426
427 fn is_positive_sign(&self) -> bool {
428 self.current_char() == Some('+')
429 }
430
431 fn is_decimal_point(&self) -> bool {
432 self.current_char() == Some('.')
433 }
434}
435
// Exposes the scanner's character cursor through the `CharProvider` trait. `parse_string`
// passes the scanner to `crate::string::parse_string_with_char_provider`, which presumably
// consumes it through this trait (the trait definition is not visible here).
impl<'a> CharProvider<'a> for Scanner<'a> {
  // Forwards to the scanner's inherent `current_char`.
  fn current_char(&mut self) -> Option<char> {
    Scanner::current_char(self)
  }

  // Forwards to the scanner's inherent `move_next_char`.
  fn move_next_char(&mut self) -> Option<char> {
    Scanner::move_next_char(self)
  }

  // Byte offset of the current character within the scanned text.
  fn byte_index(&self) -> usize {
    self.byte_index
  }

  // The complete text being scanned, with the input's own lifetime.
  fn text(&self) -> &'a str {
    self.file_text
  }
}
453
454#[cfg(test)]
455mod tests {
456 use std::borrow::Cow;
457
458 use super::super::tokens::Token;
459 use super::*;
460 use pretty_assertions::assert_eq;
461
462 #[test]
463 fn it_tokenizes_string() {
464 assert_has_tokens(
465 r#""t\"est", "\t\r\n\n\u0020 test\n other","#,
466 vec![
467 Token::String(Cow::Borrowed(r#"t"est"#)),
468 Token::Comma,
469 Token::String(Cow::Borrowed("\t\r\n\n test\n other")),
470 Token::Comma,
471 ],
472 );
473 }
474
  // Escaping a single quote inside a double-quoted string is reported as an invalid escape.
  #[test]
  fn it_errors_escaping_single_quote_in_double_quote() {
    assert_has_error(
      r#""t\'est""#,
      "Invalid escape in double quote string on line 1 column 3",
    );
  }
482
  // Single-quoted strings are accepted, including an escaped single quote inside them.
  #[test]
  fn it_tokenizes_single_quote_string() {
    assert_has_tokens(
      r#"'t\'est','a',"#,
      vec![
        Token::String(Cow::Borrowed(r#"t'est"#)),
        Token::Comma,
        Token::String(Cow::Borrowed("a")),
        Token::Comma,
      ],
    );
  }
495
  // Escaping a double quote inside a single-quoted string is reported as an invalid escape.
  #[test]
  fn it_errors_escaping_double_quote_in_single_quote() {
    assert_has_error(
      r#"'t\"est'"#,
      "Invalid escape in single quote string on line 1 column 3",
    );
  }
503
  // A word that begins with a character not allowed in words yields an unexpected-token error.
  #[test]
  fn it_errors_for_word_starting_with_invalid_token() {
    assert_has_error(r#"{ &test }"#, "Unexpected token on line 1 column 3");
  }
508
  // Decimal numbers: zero, fractions, a negative value and exponents with and without a sign.
  #[test]
  fn it_tokenizes_numbers() {
    assert_has_tokens(
      "0, 0.123, -198, 0e-345, 0.3e+025, 1e1,",
      vec![
        Token::Number("0"),
        Token::Comma,
        Token::Number("0.123"),
        Token::Comma,
        Token::Number("-198"),
        Token::Comma,
        Token::Number("0e-345"),
        Token::Comma,
        Token::Number("0.3e+025"),
        Token::Comma,
        Token::Number("1e1"),
        Token::Comma,
      ],
    );
  }
529
  // Hexadecimal literals with upper- and lower-case digits and both `0x` and `0X` prefixes.
  #[test]
  fn it_tokenizes_hexadecimal_numbers() {
    assert_has_tokens(
      "0x7DF, 0xFF, 0x123ABC, 0xabc, 0X1F",
      vec![
        Token::Number("0x7DF"),
        Token::Comma,
        Token::Number("0xFF"),
        Token::Comma,
        Token::Number("0x123ABC"),
        Token::Comma,
        Token::Number("0xabc"),
        Token::Comma,
        Token::Number("0X1F"),
      ],
    );
  }
547
  // A leading `+` is kept as part of the number token for integer, fraction, exponent and hex forms.
  #[test]
  fn it_tokenizes_unary_plus_numbers() {
    assert_has_tokens(
      "+42, +0.5, +1e10, +0xFF",
      vec![
        Token::Number("+42"),
        Token::Comma,
        Token::Number("+0.5"),
        Token::Comma,
        Token::Number("+1e10"),
        Token::Comma,
        Token::Number("+0xFF"),
      ],
    );
  }
563
  // An exponent marker must be followed by a sign or digit, and a sign must be followed by a digit.
  #[test]
  fn it_errors_invalid_exponent() {
    assert_has_error(
      r#"1ea"#,
      "Expected plus, minus, or digit in number literal on line 1 column 3",
    );
    assert_has_error(r#"1e-a"#, "Expected digit on line 1 column 4");
  }
572
  // Punctuation tokens and the `true`, `false` and `null` keywords.
  #[test]
  fn it_tokenizes_simple_tokens() {
    assert_has_tokens(
      "{}[],:true,false,null,",
      vec![
        Token::OpenBrace,
        Token::CloseBrace,
        Token::OpenBracket,
        Token::CloseBracket,
        Token::Comma,
        Token::Colon,
        Token::Boolean(true),
        Token::Comma,
        Token::Boolean(false),
        Token::Comma,
        Token::Null,
        Token::Comma,
      ],
    );
  }
593
  // Line comments end at `\n` or `\r\n`, and the line break is not part of the comment text.
  #[test]
  fn it_tokenizes_comment_line() {
    assert_has_tokens(
      "//test\n//t\r\n// test\n,",
      vec![
        Token::CommentLine("test"),
        Token::CommentLine("t"),
        Token::CommentLine(" test"),
        Token::Comma,
      ],
    );
  }
606
  // Block comments may span lines and may directly follow one another.
  #[test]
  fn it_tokenizes_comment_blocks() {
    assert_has_tokens(
      "/*test\n *//* test*/,",
      vec![
        Token::CommentBlock("test\n "),
        Token::CommentBlock(" test"),
        Token::Comma,
      ],
    );
  }
618
  // A unicode escape that is not a valid character produces an error instead of a token.
  #[test]
  fn it_errors_on_invalid_utf8_char_for_issue_6() {
    assert_has_error(
      "\"\\uDF06\"",
      "Invalid unicode escape sequence. 'DF06' is not a valid UTF8 character on line 1 column 2",
    );
  }
626
627 fn assert_has_tokens(text: &str, tokens: Vec<Token>) {
628 let mut scanner = Scanner::new(text);
629 let mut scanned_tokens = Vec::new();
630
631 loop {
632 match scanner.scan() {
633 Ok(Some(token)) => scanned_tokens.push(token),
634 Ok(None) => break,
635 Err(err) => panic!("Error parsing: {:?}", err),
636 }
637 }
638
639 assert_eq!(scanned_tokens, tokens);
640 }
641
642 fn assert_has_error(text: &str, message: &str) {
643 let mut scanner = Scanner::new(text);
644 let mut error_message = String::new();
645
646 loop {
647 match scanner.scan() {
648 Ok(Some(_)) => {}
649 Ok(None) => break,
650 Err(err) => {
651 error_message = err.to_string();
652 break;
653 }
654 }
655 }
656
657 assert_eq!(error_message, message);
658 }
659}