1use std::borrow::Cow;
2
3use crate::string::CharProvider;
4
5use super::common::Range;
6use super::errors::*;
7use super::tokens::Token;
8
9pub struct Scanner<'a> {
11 byte_index: usize,
12 token_start: usize,
13 bytes: &'a [u8],
14 current_token: Option<Token<'a>>,
15 file_text: &'a str,
16 allow_single_quoted_strings: bool,
17 allow_hexadecimal_numbers: bool,
18 allow_unary_plus_numbers: bool,
19}
20
/// Options controlling which non-standard JSON syntax the scanner accepts.
///
/// The struct is plain data (three flags), so it derives the common value
/// traits to let callers copy, store and compare option sets freely.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ScannerOptions {
    /// Whether strings delimited by single quotes (`'text'`) are accepted.
    pub allow_single_quoted_strings: bool,
    /// Whether hexadecimal number literals (`0x1F`) are accepted.
    pub allow_hexadecimal_numbers: bool,
    /// Whether number literals with a leading `+` sign (`+42`) are accepted.
    pub allow_unary_plus_numbers: bool,
}
31
32impl Default for ScannerOptions {
33 fn default() -> Self {
34 Self {
35 allow_single_quoted_strings: true,
36 allow_hexadecimal_numbers: true,
37 allow_unary_plus_numbers: true,
38 }
39 }
40}
41
42impl<'a> Scanner<'a> {
43 pub fn new(file_text: &'a str, options: &ScannerOptions) -> Scanner<'a> {
45 Scanner {
46 byte_index: 0,
47 token_start: 0,
48 bytes: file_text.as_bytes(),
49 current_token: None,
50 file_text,
51 allow_single_quoted_strings: options.allow_single_quoted_strings,
52 allow_hexadecimal_numbers: options.allow_hexadecimal_numbers,
53 allow_unary_plus_numbers: options.allow_unary_plus_numbers,
54 }
55 }
56
57 pub fn file_text(&self) -> &str {
58 self.file_text
59 }
60
61 pub fn scan(&mut self) -> Result<Option<Token<'a>>, ParseError> {
63 self.skip_whitespace();
64 self.token_start = self.byte_index;
65 if let Some(&b) = self.bytes.get(self.byte_index) {
66 let token_result = match b {
67 b'{' => {
68 self.byte_index += 1;
69 Ok(Token::OpenBrace)
70 }
71 b'}' => {
72 self.byte_index += 1;
73 Ok(Token::CloseBrace)
74 }
75 b'[' => {
76 self.byte_index += 1;
77 Ok(Token::OpenBracket)
78 }
79 b']' => {
80 self.byte_index += 1;
81 Ok(Token::CloseBracket)
82 }
83 b',' => {
84 self.byte_index += 1;
85 Ok(Token::Comma)
86 }
87 b':' => {
88 self.byte_index += 1;
89 Ok(Token::Colon)
90 }
91 b'\'' => {
92 if self.allow_single_quoted_strings {
93 self.parse_string()
94 } else {
95 Err(self.create_error_for_current_token(ParseErrorKind::SingleQuotedStringsNotAllowed))
96 }
97 }
98 b'"' => self.parse_string(),
99 b'/' => match self.bytes.get(self.byte_index + 1) {
100 Some(b'/') => Ok(self.parse_comment_line()),
101 Some(b'*') => self.parse_comment_block(),
102 _ => Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken)),
103 },
104 b'-' | b'+' | b'0'..=b'9' => self.parse_number(),
105 b't' if self.try_move_word("true") => Ok(Token::Boolean(true)),
106 b'f' if self.try_move_word("false") => Ok(Token::Boolean(false)),
107 b'n' if self.try_move_word("null") => Ok(Token::Null),
108 _ => self.parse_word(),
109 };
110 match token_result {
111 Ok(token) => {
112 self.current_token = Some(token.clone());
113 Ok(Some(token))
114 }
115 Err(err) => Err(err),
116 }
117 } else {
118 self.current_token = None;
119 Ok(None)
120 }
121 }
122
123 pub fn token_start(&self) -> usize {
125 self.token_start
126 }
127
128 pub fn token_end(&self) -> usize {
130 self.byte_index
131 }
132
133 pub fn token(&self) -> Option<Token<'a>> {
135 self.current_token.as_ref().map(|x| x.to_owned())
136 }
137
138 pub(super) fn create_error_for_current_token(&self, kind: ParseErrorKind) -> ParseError {
139 let end = if self.byte_index > self.token_start {
140 self.byte_index
142 } else if let Some(c) = self.file_text[self.byte_index..].chars().next() {
143 self.byte_index + c.len_utf8()
145 } else {
146 self.file_text.len()
147 };
148 let range = Range {
149 start: self.token_start,
150 end,
151 };
152 self.create_error_for_range(range, kind)
153 }
154
155 pub(super) fn create_error_for_current_char(&self, kind: ParseErrorKind) -> ParseError {
156 self.create_error_for_start(self.byte_index, kind)
157 }
158
159 pub(super) fn create_error_for_start(&self, start: usize, kind: ParseErrorKind) -> ParseError {
160 let range = Range {
161 start,
162 end: if let Some(c) = self.file_text[self.byte_index..].chars().next() {
163 self.byte_index + c.len_utf8()
164 } else {
165 self.file_text.len()
166 },
167 };
168 self.create_error_for_range(range, kind)
169 }
170
171 pub(super) fn create_error_for_range(&self, range: Range, kind: ParseErrorKind) -> ParseError {
172 ParseError::new(range, kind, self.file_text)
173 }
174
175 fn parse_string(&mut self) -> Result<Token<'a>, ParseError> {
176 let quote = self.bytes[self.byte_index];
177 let start = self.byte_index + 1;
178
179 let mut i = start;
183 while i < self.bytes.len() {
184 let b = self.bytes[i];
185 if b == quote {
186 let s = &self.file_text[start..i];
188 self.byte_index = i + 1;
189 return Ok(Token::String(Cow::Borrowed(s)));
190 }
191 if b == b'\\' {
192 break;
193 }
194 i += 1;
195 }
196
197 crate::string::parse_string_with_char_provider(self)
199 .map(Token::String)
200 .map_err(|err| self.create_error_for_start(err.byte_index, ParseErrorKind::String(err.kind)))
202 }
203
204 fn parse_number(&mut self) -> Result<Token<'a>, ParseError> {
205 let start_byte_index = self.byte_index;
206
207 match self.bytes.get(self.byte_index) {
209 Some(b'+') => {
210 if !self.allow_unary_plus_numbers {
211 return Err(self.create_error_for_current_token(ParseErrorKind::UnaryPlusNumbersNotAllowed));
212 }
213 self.byte_index += 1;
214 }
215 Some(b'-') => {
216 self.byte_index += 1;
217 }
218 _ => {}
219 }
220
221 match self.bytes.get(self.byte_index) {
222 Some(b'0') => {
223 self.byte_index += 1;
224
225 if matches!(self.bytes.get(self.byte_index), Some(b'x' | b'X')) {
227 if !self.allow_hexadecimal_numbers {
228 return Err(self.create_error_for_current_token(ParseErrorKind::HexadecimalNumbersNotAllowed));
229 }
230
231 self.byte_index += 1;
232
233 if !matches!(self.bytes.get(self.byte_index), Some(b) if b.is_ascii_hexdigit()) {
235 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
236 }
237
238 while matches!(self.bytes.get(self.byte_index), Some(b) if b.is_ascii_hexdigit()) {
239 self.byte_index += 1;
240 }
241
242 return Ok(Token::Number(&self.file_text[start_byte_index..self.byte_index]));
243 }
244 }
245 Some(b'1'..=b'9') => {
246 self.byte_index += 1;
247 while matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
248 self.byte_index += 1;
249 }
250 }
251 _ => {
252 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigitFollowingNegativeSign));
253 }
254 }
255
256 if self.bytes.get(self.byte_index) == Some(&b'.') {
257 self.byte_index += 1;
258
259 if !matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
260 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
261 }
262
263 while matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
264 self.byte_index += 1;
265 }
266 }
267
268 if matches!(self.bytes.get(self.byte_index), Some(b'e' | b'E')) {
269 self.byte_index += 1;
270
271 match self.bytes.get(self.byte_index) {
272 Some(b'-' | b'+') => {
273 self.byte_index += 1;
274 if !matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
275 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
276 }
277 }
278 Some(b'0'..=b'9') => {}
279 _ => {
280 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedPlusMinusOrDigitInNumberLiteral));
281 }
282 }
283
284 while matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
285 self.byte_index += 1;
286 }
287 }
288
289 Ok(Token::Number(&self.file_text[start_byte_index..self.byte_index]))
290 }
291
292 fn parse_comment_line(&mut self) -> Token<'a> {
293 debug_assert!(self.bytes[self.byte_index] == b'/');
294 self.byte_index += 1;
295 debug_assert!(self.bytes[self.byte_index] == b'/');
296 let start_byte_index = self.byte_index + 1;
297 self.byte_index += 1;
298
299 while let Some(&b) = self.bytes.get(self.byte_index) {
302 if b == b'\n' {
303 break;
304 }
305 if b == b'\r' && self.bytes.get(self.byte_index + 1) == Some(&b'\n') {
306 break;
307 }
308 self.byte_index += 1;
309 }
310
311 Token::CommentLine(&self.file_text[start_byte_index..self.byte_index])
312 }
313
314 fn parse_comment_block(&mut self) -> Result<Token<'a>, ParseError> {
315 debug_assert!(self.bytes[self.byte_index] == b'/');
316 self.byte_index += 1;
317 debug_assert!(self.bytes[self.byte_index] == b'*');
318 let start_byte_index = self.byte_index + 1;
319 self.byte_index += 1;
320
321 loop {
323 match self.bytes.get(self.byte_index) {
324 Some(&b'*') if self.bytes.get(self.byte_index + 1) == Some(&b'/') => {
325 let end_byte_index = self.byte_index;
326 self.byte_index += 2;
327 return Ok(Token::CommentBlock(&self.file_text[start_byte_index..end_byte_index]));
328 }
329 Some(_) => self.byte_index += 1,
330 None => return Err(self.create_error_for_current_token(ParseErrorKind::UnterminatedCommentBlock)),
331 }
332 }
333 }
334
335 fn skip_whitespace(&mut self) {
336 while let Some(&b) = self.bytes.get(self.byte_index) {
337 if b <= b' ' {
338 match b {
339 b' ' | b'\t' | b'\n' | b'\r' | 0x0B | 0x0C => {
340 self.byte_index += 1;
341 continue;
342 }
343 _ => break,
344 }
345 } else if b >= 0x80 {
346 let c = self.file_text[self.byte_index..].chars().next().unwrap();
348 if c.is_whitespace() {
349 self.byte_index += c.len_utf8();
350 continue;
351 }
352 break;
353 } else {
354 break;
355 }
356 }
357 }
358
359 fn try_move_word(&mut self, text: &str) -> bool {
360 let text_bytes = text.as_bytes();
361 let end = self.byte_index + text_bytes.len();
362 if end > self.bytes.len() {
363 return false;
364 }
365 if &self.bytes[self.byte_index..end] != text_bytes {
366 return false;
367 }
368 if let Some(&next_byte) = self.bytes.get(end) {
370 if next_byte.is_ascii_alphanumeric() {
371 return false;
372 }
373 if next_byte >= 0x80
375 && let Some(c) = self.file_text[end..].chars().next()
376 && c.is_alphanumeric()
377 {
378 return false;
379 }
380 }
381 self.byte_index = end;
382 true
383 }
384
385 fn parse_word(&mut self) -> Result<Token<'a>, ParseError> {
386 let start_byte_index = self.byte_index;
387
388 while self.byte_index < self.bytes.len() {
389 let b = self.bytes[self.byte_index];
390 if b < 0x80 {
391 if b.is_ascii_whitespace() || b == b':' {
393 break;
394 }
395 if b.is_ascii_alphanumeric() || b == b'-' || b == b'_' {
396 self.byte_index += 1;
397 } else {
398 return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
399 }
400 } else {
401 let c = self.file_text[self.byte_index..].chars().next().unwrap();
403 if c.is_whitespace() {
404 break;
405 }
406 if c.is_alphanumeric() {
407 self.byte_index += c.len_utf8();
408 } else {
409 return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
410 }
411 }
412 }
413
414 if self.byte_index == start_byte_index {
415 return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
416 }
417
418 Ok(Token::Word(&self.file_text[start_byte_index..self.byte_index]))
419 }
420
421 fn current_char(&self) -> Option<char> {
422 let &b = self.bytes.get(self.byte_index)?;
423 if b < 0x80 {
424 Some(b as char)
425 } else {
426 self.file_text[self.byte_index..].chars().next()
427 }
428 }
429
430 fn move_next_char(&mut self) -> Option<char> {
431 if self.byte_index >= self.bytes.len() {
432 return None;
433 }
434 let b = self.bytes[self.byte_index];
435 if b < 0x80 {
436 self.byte_index += 1;
437 } else {
438 let c = self.file_text[self.byte_index..].chars().next().unwrap();
439 self.byte_index += c.len_utf8();
440 }
441 self.current_char()
442 }
443}
444
445impl<'a> CharProvider<'a> for Scanner<'a> {
446 fn current_char(&mut self) -> Option<char> {
447 Scanner::current_char(self)
448 }
449
450 fn move_next_char(&mut self) -> Option<char> {
451 Scanner::move_next_char(self)
452 }
453
454 fn byte_index(&self) -> usize {
455 self.byte_index
456 }
457
458 fn text(&self) -> &'a str {
459 self.file_text
460 }
461}
462
#[cfg(test)]
mod tests {
    use std::borrow::Cow;

    use super::super::tokens::Token;
    use super::*;
    use pretty_assertions::assert_eq;

    #[test]
    fn it_tokenizes_string() {
        assert_has_tokens(
            r#""t\"est", "\t\r\n\n\u0020 test\n other","#,
            vec![
                Token::String(Cow::Borrowed(r#"t"est"#)),
                Token::Comma,
                // The input contains the escape `\u0020` followed by a literal
                // space, so two spaces are expected before `test`. The first
                // is spelled as an escape to keep it visible.
                Token::String(Cow::Borrowed("\t\r\n\n\u{20} test\n other")),
                Token::Comma,
            ],
        );
    }

    #[test]
    fn it_errors_escaping_single_quote_in_double_quote() {
        assert_has_error(
            r#""t\'est""#,
            "Invalid escape in double quote string on line 1 column 3",
        );
    }

    #[test]
    fn it_tokenizes_single_quote_string() {
        assert_has_tokens(
            r#"'t\'est','a',"#,
            vec![
                Token::String(Cow::Borrowed(r#"t'est"#)),
                Token::Comma,
                Token::String(Cow::Borrowed("a")),
                Token::Comma,
            ],
        );
    }

    #[test]
    fn it_errors_escaping_double_quote_in_single_quote() {
        assert_has_error(
            r#"'t\"est'"#,
            "Invalid escape in single quote string on line 1 column 3",
        );
    }

    #[test]
    fn it_errors_for_word_starting_with_invalid_token() {
        assert_has_error(r#"{ &test }"#, "Unexpected token on line 1 column 3");
    }

    #[test]
    fn it_tokenizes_numbers() {
        assert_has_tokens(
            "0, 0.123, -198, 0e-345, 0.3e+025, 1e1,",
            vec![
                Token::Number("0"),
                Token::Comma,
                Token::Number("0.123"),
                Token::Comma,
                Token::Number("-198"),
                Token::Comma,
                Token::Number("0e-345"),
                Token::Comma,
                Token::Number("0.3e+025"),
                Token::Comma,
                Token::Number("1e1"),
                Token::Comma,
            ],
        );
    }

    #[test]
    fn it_tokenizes_hexadecimal_numbers() {
        assert_has_tokens(
            "0x7DF, 0xFF, 0x123ABC, 0xabc, 0X1F",
            vec![
                Token::Number("0x7DF"),
                Token::Comma,
                Token::Number("0xFF"),
                Token::Comma,
                Token::Number("0x123ABC"),
                Token::Comma,
                Token::Number("0xabc"),
                Token::Comma,
                Token::Number("0X1F"),
            ],
        );
    }

    #[test]
    fn it_tokenizes_unary_plus_numbers() {
        assert_has_tokens(
            "+42, +0.5, +1e10, +0xFF",
            vec![
                Token::Number("+42"),
                Token::Comma,
                Token::Number("+0.5"),
                Token::Comma,
                Token::Number("+1e10"),
                Token::Comma,
                Token::Number("+0xFF"),
            ],
        );
    }

    #[test]
    fn it_errors_invalid_exponent() {
        assert_has_error(
            r#"1ea"#,
            "Expected plus, minus, or digit in number literal on line 1 column 3",
        );
        assert_has_error(r#"1e-a"#, "Expected digit on line 1 column 4");
    }

    #[test]
    fn it_tokenizes_simple_tokens() {
        assert_has_tokens(
            "{}[],:true,false,null,",
            vec![
                Token::OpenBrace,
                Token::CloseBrace,
                Token::OpenBracket,
                Token::CloseBracket,
                Token::Comma,
                Token::Colon,
                Token::Boolean(true),
                Token::Comma,
                Token::Boolean(false),
                Token::Comma,
                Token::Null,
                Token::Comma,
            ],
        );
    }

    #[test]
    fn it_tokenizes_comment_line() {
        assert_has_tokens(
            "//test\n//t\r\n// test\n,",
            vec![
                Token::CommentLine("test"),
                Token::CommentLine("t"),
                Token::CommentLine(" test"),
                Token::Comma,
            ],
        );
    }

    #[test]
    fn it_tokenizes_comment_blocks() {
        assert_has_tokens(
            "/*test\n *//* test*/,",
            vec![
                Token::CommentBlock("test\n "),
                Token::CommentBlock(" test"),
                Token::Comma,
            ],
        );
    }

    #[test]
    fn it_errors_on_invalid_utf8_char_for_issue_6() {
        assert_has_error(
            "\"\\uDF06\"",
            "Invalid unicode escape sequence. 'DF06 (unpaired low surrogate)' is not a valid UTF8 character on line 1 column 2",
        );
    }

    /// Scans `text` to the end with default options and asserts that exactly
    /// `tokens` were produced, panicking on any scan error.
    fn assert_has_tokens(text: &str, tokens: Vec<Token>) {
        let mut scanner = Scanner::new(text, &Default::default());
        let mut scanned_tokens = Vec::new();

        while let Some(token) = scanner
            .scan()
            .unwrap_or_else(|err| panic!("Error parsing: {:?}", err))
        {
            scanned_tokens.push(token);
        }

        assert_eq!(scanned_tokens, tokens);
    }

    /// Scans `text` with default options until the first error and asserts
    /// that its message equals `message`. When scanning finishes without an
    /// error, the comparison is made against an empty string and so fails.
    fn assert_has_error(text: &str, message: &str) {
        let mut scanner = Scanner::new(text, &Default::default());

        let error_message = loop {
            match scanner.scan() {
                Ok(Some(_)) => continue,
                Ok(None) => break String::new(),
                Err(err) => break err.to_string(),
            }
        };

        assert_eq!(error_message, message);
    }
}