1use std::borrow::Cow;
2
3use crate::string::CharProvider;
4
5use super::common::Range;
6use super::errors::*;
7use super::tokens::Token;
8
9pub struct Scanner<'a> {
11 byte_index: usize,
12 token_start: usize,
13 bytes: &'a [u8],
14 current_token: Option<Token<'a>>,
15 file_text: &'a str,
16 allow_single_quoted_strings: bool,
17 allow_hexadecimal_numbers: bool,
18 allow_unary_plus_numbers: bool,
19}
20
/// Switches for the syntax extensions the scanner accepts beyond strict JSON.
///
/// The `Default` implementation enables every extension.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ScannerOptions {
    /// Accept strings delimited by single quotes (`'text'`).
    pub allow_single_quoted_strings: bool,
    /// Accept hexadecimal number literals (`0x1F`).
    pub allow_hexadecimal_numbers: bool,
    /// Accept number literals with a leading plus sign (`+42`).
    pub allow_unary_plus_numbers: bool,
}
31
32impl Default for ScannerOptions {
33 fn default() -> Self {
34 Self {
35 allow_single_quoted_strings: true,
36 allow_hexadecimal_numbers: true,
37 allow_unary_plus_numbers: true,
38 }
39 }
40}
41
42impl<'a> Scanner<'a> {
43 pub fn new(file_text: &'a str, options: &ScannerOptions) -> Scanner<'a> {
45 Scanner {
46 byte_index: 0,
47 token_start: 0,
48 bytes: file_text.as_bytes(),
49 current_token: None,
50 file_text,
51 allow_single_quoted_strings: options.allow_single_quoted_strings,
52 allow_hexadecimal_numbers: options.allow_hexadecimal_numbers,
53 allow_unary_plus_numbers: options.allow_unary_plus_numbers,
54 }
55 }
56
57 pub fn file_text(&self) -> &str {
58 self.file_text
59 }
60
61 pub fn scan(&mut self) -> Result<Option<Token<'a>>, ParseError> {
63 self.skip_whitespace();
64 self.token_start = self.byte_index;
65 if let Some(&b) = self.bytes.get(self.byte_index) {
66 let token_result = match b {
67 b'{' => {
68 self.byte_index += 1;
69 Ok(Token::OpenBrace)
70 }
71 b'}' => {
72 self.byte_index += 1;
73 Ok(Token::CloseBrace)
74 }
75 b'[' => {
76 self.byte_index += 1;
77 Ok(Token::OpenBracket)
78 }
79 b']' => {
80 self.byte_index += 1;
81 Ok(Token::CloseBracket)
82 }
83 b',' => {
84 self.byte_index += 1;
85 Ok(Token::Comma)
86 }
87 b':' => {
88 self.byte_index += 1;
89 Ok(Token::Colon)
90 }
91 b'\'' => {
92 if self.allow_single_quoted_strings {
93 self.parse_string()
94 } else {
95 Err(self.create_error_for_current_token(ParseErrorKind::SingleQuotedStringsNotAllowed))
96 }
97 }
98 b'"' => self.parse_string(),
99 b'/' => match self.bytes.get(self.byte_index + 1) {
100 Some(b'/') => Ok(self.parse_comment_line()),
101 Some(b'*') => self.parse_comment_block(),
102 _ => Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken)),
103 },
104 b'-' | b'+' | b'0'..=b'9' => self.parse_number(),
105 b't' if self.try_move_word("true") => Ok(Token::Boolean(true)),
106 b'f' if self.try_move_word("false") => Ok(Token::Boolean(false)),
107 b'n' if self.try_move_word("null") => Ok(Token::Null),
108 _ => self.parse_word(),
109 };
110 match token_result {
111 Ok(token) => {
112 self.current_token = Some(token.clone());
113 Ok(Some(token))
114 }
115 Err(err) => Err(err),
116 }
117 } else {
118 self.current_token = None;
119 Ok(None)
120 }
121 }
122
123 pub fn token_start(&self) -> usize {
125 self.token_start
126 }
127
128 pub fn token_end(&self) -> usize {
130 self.byte_index
131 }
132
133 pub fn token(&self) -> Option<Token<'a>> {
135 self.current_token.as_ref().map(|x| x.to_owned())
136 }
137
138 pub(super) fn create_error_for_current_token(&self, kind: ParseErrorKind) -> ParseError {
139 self.create_error_for_start(self.token_start, kind)
140 }
141
142 pub(super) fn create_error_for_current_char(&self, kind: ParseErrorKind) -> ParseError {
143 self.create_error_for_start(self.byte_index, kind)
144 }
145
146 pub(super) fn create_error_for_start(&self, start: usize, kind: ParseErrorKind) -> ParseError {
147 let range = Range {
148 start,
149 end: if let Some(c) = self.file_text[self.byte_index..].chars().next() {
150 self.byte_index + c.len_utf8()
151 } else {
152 self.file_text.len()
153 },
154 };
155 self.create_error_for_range(range, kind)
156 }
157
158 pub(super) fn create_error_for_range(&self, range: Range, kind: ParseErrorKind) -> ParseError {
159 ParseError::new(range, kind, self.file_text)
160 }
161
162 fn parse_string(&mut self) -> Result<Token<'a>, ParseError> {
163 let quote = self.bytes[self.byte_index];
164 let start = self.byte_index + 1;
165
166 let mut i = start;
170 while i < self.bytes.len() {
171 let b = self.bytes[i];
172 if b == quote {
173 let s = &self.file_text[start..i];
175 self.byte_index = i + 1;
176 return Ok(Token::String(Cow::Borrowed(s)));
177 }
178 if b == b'\\' {
179 break;
180 }
181 i += 1;
182 }
183
184 crate::string::parse_string_with_char_provider(self)
186 .map(Token::String)
187 .map_err(|err| self.create_error_for_start(err.byte_index, ParseErrorKind::String(err.kind)))
189 }
190
191 fn parse_number(&mut self) -> Result<Token<'a>, ParseError> {
192 let start_byte_index = self.byte_index;
193
194 match self.bytes.get(self.byte_index) {
196 Some(b'+') => {
197 if !self.allow_unary_plus_numbers {
198 return Err(self.create_error_for_current_token(ParseErrorKind::UnaryPlusNumbersNotAllowed));
199 }
200 self.byte_index += 1;
201 }
202 Some(b'-') => {
203 self.byte_index += 1;
204 }
205 _ => {}
206 }
207
208 match self.bytes.get(self.byte_index) {
209 Some(b'0') => {
210 self.byte_index += 1;
211
212 if matches!(self.bytes.get(self.byte_index), Some(b'x' | b'X')) {
214 if !self.allow_hexadecimal_numbers {
215 return Err(self.create_error_for_current_token(ParseErrorKind::HexadecimalNumbersNotAllowed));
216 }
217
218 self.byte_index += 1;
219
220 if !matches!(self.bytes.get(self.byte_index), Some(b) if b.is_ascii_hexdigit()) {
222 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
223 }
224
225 while matches!(self.bytes.get(self.byte_index), Some(b) if b.is_ascii_hexdigit()) {
226 self.byte_index += 1;
227 }
228
229 return Ok(Token::Number(&self.file_text[start_byte_index..self.byte_index]));
230 }
231 }
232 Some(b'1'..=b'9') => {
233 self.byte_index += 1;
234 while matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
235 self.byte_index += 1;
236 }
237 }
238 _ => {
239 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigitFollowingNegativeSign));
240 }
241 }
242
243 if self.bytes.get(self.byte_index) == Some(&b'.') {
244 self.byte_index += 1;
245
246 if !matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
247 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
248 }
249
250 while matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
251 self.byte_index += 1;
252 }
253 }
254
255 if matches!(self.bytes.get(self.byte_index), Some(b'e' | b'E')) {
256 self.byte_index += 1;
257
258 match self.bytes.get(self.byte_index) {
259 Some(b'-' | b'+') => {
260 self.byte_index += 1;
261 if !matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
262 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
263 }
264 }
265 Some(b'0'..=b'9') => {}
266 _ => {
267 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedPlusMinusOrDigitInNumberLiteral));
268 }
269 }
270
271 while matches!(self.bytes.get(self.byte_index), Some(b'0'..=b'9')) {
272 self.byte_index += 1;
273 }
274 }
275
276 Ok(Token::Number(&self.file_text[start_byte_index..self.byte_index]))
277 }
278
279 fn parse_comment_line(&mut self) -> Token<'a> {
280 debug_assert!(self.bytes[self.byte_index] == b'/');
281 self.byte_index += 1;
282 debug_assert!(self.bytes[self.byte_index] == b'/');
283 let start_byte_index = self.byte_index + 1;
284 self.byte_index += 1;
285
286 while let Some(&b) = self.bytes.get(self.byte_index) {
289 if b == b'\n' {
290 break;
291 }
292 if b == b'\r' && self.bytes.get(self.byte_index + 1) == Some(&b'\n') {
293 break;
294 }
295 self.byte_index += 1;
296 }
297
298 Token::CommentLine(&self.file_text[start_byte_index..self.byte_index])
299 }
300
301 fn parse_comment_block(&mut self) -> Result<Token<'a>, ParseError> {
302 debug_assert!(self.bytes[self.byte_index] == b'/');
303 self.byte_index += 1;
304 debug_assert!(self.bytes[self.byte_index] == b'*');
305 let start_byte_index = self.byte_index + 1;
306 self.byte_index += 1;
307
308 loop {
310 match self.bytes.get(self.byte_index) {
311 Some(&b'*') if self.bytes.get(self.byte_index + 1) == Some(&b'/') => {
312 let end_byte_index = self.byte_index;
313 self.byte_index += 2;
314 return Ok(Token::CommentBlock(&self.file_text[start_byte_index..end_byte_index]));
315 }
316 Some(_) => self.byte_index += 1,
317 None => return Err(self.create_error_for_current_token(ParseErrorKind::UnterminatedCommentBlock)),
318 }
319 }
320 }
321
322 fn skip_whitespace(&mut self) {
323 while let Some(&b) = self.bytes.get(self.byte_index) {
324 if b <= b' ' {
325 match b {
326 b' ' | b'\t' | b'\n' | b'\r' | 0x0B | 0x0C => {
327 self.byte_index += 1;
328 continue;
329 }
330 _ => break,
331 }
332 } else if b >= 0x80 {
333 let c = self.file_text[self.byte_index..].chars().next().unwrap();
335 if c.is_whitespace() {
336 self.byte_index += c.len_utf8();
337 continue;
338 }
339 break;
340 } else {
341 break;
342 }
343 }
344 }
345
346 fn try_move_word(&mut self, text: &str) -> bool {
347 let text_bytes = text.as_bytes();
348 let end = self.byte_index + text_bytes.len();
349 if end > self.bytes.len() {
350 return false;
351 }
352 if &self.bytes[self.byte_index..end] != text_bytes {
353 return false;
354 }
355 if let Some(&next_byte) = self.bytes.get(end) {
357 if next_byte.is_ascii_alphanumeric() {
358 return false;
359 }
360 if next_byte >= 0x80
362 && let Some(c) = self.file_text[end..].chars().next()
363 && c.is_alphanumeric()
364 {
365 return false;
366 }
367 }
368 self.byte_index = end;
369 true
370 }
371
372 fn parse_word(&mut self) -> Result<Token<'a>, ParseError> {
373 let start_byte_index = self.byte_index;
374
375 while self.byte_index < self.bytes.len() {
376 let b = self.bytes[self.byte_index];
377 if b < 0x80 {
378 if b.is_ascii_whitespace() || b == b':' {
380 break;
381 }
382 if b.is_ascii_alphanumeric() || b == b'-' || b == b'_' {
383 self.byte_index += 1;
384 } else {
385 return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
386 }
387 } else {
388 let c = self.file_text[self.byte_index..].chars().next().unwrap();
390 if c.is_whitespace() {
391 break;
392 }
393 if c.is_alphanumeric() {
394 self.byte_index += c.len_utf8();
395 } else {
396 return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
397 }
398 }
399 }
400
401 if self.byte_index == start_byte_index {
402 return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
403 }
404
405 Ok(Token::Word(&self.file_text[start_byte_index..self.byte_index]))
406 }
407
408 fn current_char(&self) -> Option<char> {
409 let &b = self.bytes.get(self.byte_index)?;
410 if b < 0x80 {
411 Some(b as char)
412 } else {
413 self.file_text[self.byte_index..].chars().next()
414 }
415 }
416
417 fn move_next_char(&mut self) -> Option<char> {
418 if self.byte_index >= self.bytes.len() {
419 return None;
420 }
421 let b = self.bytes[self.byte_index];
422 if b < 0x80 {
423 self.byte_index += 1;
424 } else {
425 let c = self.file_text[self.byte_index..].chars().next().unwrap();
426 self.byte_index += c.len_utf8();
427 }
428 self.current_char()
429 }
430}
431
432impl<'a> CharProvider<'a> for Scanner<'a> {
433 fn current_char(&mut self) -> Option<char> {
434 Scanner::current_char(self)
435 }
436
437 fn move_next_char(&mut self) -> Option<char> {
438 Scanner::move_next_char(self)
439 }
440
441 fn byte_index(&self) -> usize {
442 self.byte_index
443 }
444
445 fn text(&self) -> &'a str {
446 self.file_text
447 }
448}
449
450#[cfg(test)]
451mod tests {
452 use std::borrow::Cow;
453
454 use super::super::tokens::Token;
455 use super::*;
456 use pretty_assertions::assert_eq;
457
#[test]
fn it_tokenizes_string() {
    // The second string mixes simple escapes with a `\u0020` escape (a space)
    // that is directly followed by a literal space, so the decoded text must
    // contain two consecutive spaces before "test".
    assert_has_tokens(
        r#""t\"est", "\t\r\n\n\u0020 test\n other","#,
        vec![
            Token::String(Cow::Borrowed(r#"t"est"#)),
            Token::Comma,
            Token::String(Cow::Borrowed("\t\r\n\n  test\n other")),
            Token::Comma,
        ],
    );
}
470
#[test]
fn it_errors_escaping_single_quote_in_double_quote() {
    // `\'` is not a valid escape inside a double-quoted string
    let input = r#""t\'est""#;
    let expected = "Invalid escape in double quote string on line 1 column 3";
    assert_has_error(input, expected);
}
478
#[test]
fn it_tokenizes_single_quote_string() {
    let input = r#"'t\'est','a',"#;
    let expected = vec![
        Token::String(Cow::Borrowed(r#"t'est"#)),
        Token::Comma,
        Token::String(Cow::Borrowed("a")),
        Token::Comma,
    ];
    assert_has_tokens(input, expected);
}
491
#[test]
fn it_errors_escaping_double_quote_in_single_quote() {
    // `\"` is not a valid escape inside a single-quoted string
    let input = r#"'t\"est'"#;
    let expected = "Invalid escape in single quote string on line 1 column 3";
    assert_has_error(input, expected);
}
499
#[test]
fn it_errors_for_word_starting_with_invalid_token() {
    let input = r#"{ &test }"#;
    let expected = "Unexpected token on line 1 column 3";
    assert_has_error(input, expected);
}
504
#[test]
fn it_tokenizes_numbers() {
    let input = "0, 0.123, -198, 0e-345, 0.3e+025, 1e1,";
    let expected = vec![
        Token::Number("0"),
        Token::Comma,
        Token::Number("0.123"),
        Token::Comma,
        Token::Number("-198"),
        Token::Comma,
        Token::Number("0e-345"),
        Token::Comma,
        Token::Number("0.3e+025"),
        Token::Comma,
        Token::Number("1e1"),
        Token::Comma,
    ];
    assert_has_tokens(input, expected);
}
525
#[test]
fn it_tokenizes_hexadecimal_numbers() {
    // covers upper- and lower-case digits as well as the `0X` prefix
    let input = "0x7DF, 0xFF, 0x123ABC, 0xabc, 0X1F";
    let expected = vec![
        Token::Number("0x7DF"),
        Token::Comma,
        Token::Number("0xFF"),
        Token::Comma,
        Token::Number("0x123ABC"),
        Token::Comma,
        Token::Number("0xabc"),
        Token::Comma,
        Token::Number("0X1F"),
    ];
    assert_has_tokens(input, expected);
}
543
#[test]
fn it_tokenizes_unary_plus_numbers() {
    // the leading `+` stays part of the number's text
    let input = "+42, +0.5, +1e10, +0xFF";
    let expected = vec![
        Token::Number("+42"),
        Token::Comma,
        Token::Number("+0.5"),
        Token::Comma,
        Token::Number("+1e10"),
        Token::Comma,
        Token::Number("+0xFF"),
    ];
    assert_has_tokens(input, expected);
}
559
#[test]
fn it_errors_invalid_exponent() {
    let cases = [
        (
            "1ea",
            "Expected plus, minus, or digit in number literal on line 1 column 3",
        ),
        ("1e-a", "Expected digit on line 1 column 4"),
    ];
    for (input, expected) in cases {
        assert_has_error(input, expected);
    }
}
568
#[test]
fn it_tokenizes_simple_tokens() {
    let input = "{}[],:true,false,null,";
    let expected = vec![
        Token::OpenBrace,
        Token::CloseBrace,
        Token::OpenBracket,
        Token::CloseBracket,
        Token::Comma,
        Token::Colon,
        Token::Boolean(true),
        Token::Comma,
        Token::Boolean(false),
        Token::Comma,
        Token::Null,
        Token::Comma,
    ];
    assert_has_tokens(input, expected);
}
589
#[test]
fn it_tokenizes_comment_line() {
    // mixes `\n` and `\r\n` line endings; neither belongs to the comment text
    let input = "//test\n//t\r\n// test\n,";
    let expected = vec![
        Token::CommentLine("test"),
        Token::CommentLine("t"),
        Token::CommentLine(" test"),
        Token::Comma,
    ];
    assert_has_tokens(input, expected);
}
602
#[test]
fn it_tokenizes_comment_blocks() {
    let input = "/*test\n *//* test*/,";
    let expected = vec![
        Token::CommentBlock("test\n "),
        Token::CommentBlock(" test"),
        Token::Comma,
    ];
    assert_has_tokens(input, expected);
}
614
#[test]
fn it_errors_on_invalid_utf8_char_for_issue_6() {
    // a lone low surrogate cannot be turned into a character
    let input = "\"\\uDF06\"";
    let expected = "Invalid unicode escape sequence. 'DF06 (unpaired low surrogate)' is not a valid UTF8 character on line 1 column 2";
    assert_has_error(input, expected);
}
622
623 fn assert_has_tokens(text: &str, tokens: Vec<Token>) {
624 let mut scanner = Scanner::new(text, &Default::default());
625 let mut scanned_tokens = Vec::new();
626
627 loop {
628 match scanner.scan() {
629 Ok(Some(token)) => scanned_tokens.push(token),
630 Ok(None) => break,
631 Err(err) => panic!("Error parsing: {:?}", err),
632 }
633 }
634
635 assert_eq!(scanned_tokens, tokens);
636 }
637
638 fn assert_has_error(text: &str, message: &str) {
639 let mut scanner = Scanner::new(text, &Default::default());
640 let mut error_message = String::new();
641
642 loop {
643 match scanner.scan() {
644 Ok(Some(_)) => {}
645 Ok(None) => break,
646 Err(err) => {
647 error_message = err.to_string();
648 break;
649 }
650 }
651 }
652
653 assert_eq!(error_message, message);
654 }
655}