use std::fmt::{Display, Formatter};
use std::iter::Peekable;
use std::str::CharIndices;
use crate::utils::get_line_col_char;

#[derive(PartialEq, Clone, Debug)]
pub enum TokType {
    LeftBrace,
    RightBrace,
    LeftBracket,
    RightBracket,
    Comma,
    Colon,
    Name,
    SingleQuotedString,
    DoubleQuotedString,
    BlockComment,
    LineComment,
    Whitespace,
    True,
    False,
    Null,
    Integer,
    Float,
    Infinity,
    Nan,
    Exponent,
    Hexadecimal,
    Plus,
    Minus,
    EOF,
}

/// A token as (start byte offset, token type, exclusive end byte offset).
pub(crate) type TokenSpan = (usize, TokType, usize);

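/// The output of tokenization: token spans plus the source text they index into.
///
/// A hedged usage sketch (offsets are byte indices into the source):
///
/// ```ignore
/// let text = "[true]";
/// let toks = tokenize_str(text).unwrap();
/// let (start, _tok_type, end) = toks.tok_spans[1].clone();
/// assert_eq!(&text[start..end], "true");
/// ```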
#[derive(Debug, PartialEq)]
pub struct Tokens<'input> {
    pub tok_spans: Vec<TokenSpan>,
    pub(crate) source: &'input str,
}

#[derive(Debug)]
pub struct TokenizationError {
    pub message: String,
    pub index: usize,
    pub lineno: usize,
    pub colno: usize,
    pub char_index: usize,
}

impl Display for TokenizationError {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(f, "TokenizationError: {}: line {} column {} (char {})", self.message, self.lineno, self.colno, self.char_index)
    }
}

#[derive(Debug)]
pub(crate) struct Tokenizer<'input> {
    configuration: TokenizerConfig,
    text: &'input str,
    chars: Peekable<CharIndices<'input>>,
    lookahead: Option<(usize, char)>,
}

const HEX_CHARS: &str = "0123456789abcdefABCDEF";
const IDENTIFIER_START_SYMBOLS: &str = "$_";
const IDENTIFIER_PARTS: &str = "$_\u{200C}\u{200D}\u{005F}\u{203F}\u{2040}\u{2054}\u{FE33}\u{FE34}\u{FE4D}\u{FE4E}\u{FE4F}\u{FF3F}";

#[derive(Debug)]
pub struct TokenizerConfig {
    /// Emit `Whitespace` tokens rather than discarding whitespace.
    pub include_whitespace: bool,
    /// Emit `LineComment`/`BlockComment` tokens rather than discarding comments.
    pub include_comments: bool,
    /// Permit legacy octal literals (e.g. `0123`); currently unimplemented.
    pub allow_octal: bool,
}

impl TokenizerConfig {
    pub fn new() -> Self {
        TokenizerConfig { include_whitespace: false, include_comments: false, allow_octal: false }
    }
}

impl<'input> Tokenizer<'input> {
    pub fn new(text: &'input str) -> Self {
        Tokenizer { configuration: TokenizerConfig::new(), text, chars: text.char_indices().peekable(), lookahead: None }
    }

    pub fn with_configuration(text: &'input str, configuration: TokenizerConfig) -> Self {
        Tokenizer { configuration, text, chars: text.char_indices().peekable(), lookahead: None }
    }

    // Consumes the next char, remembering it in `lookahead`.
    fn advance(&mut self) -> Option<(usize, char)> {
        self.lookahead = self.chars.next();
        self.lookahead
    }

    fn make_error(&self, message: String, start_index: usize) -> TokenizationError {
        let (lineno, colno, char_index) = get_line_col_char(self.text, start_index);
        TokenizationError { message, index: start_index, lineno, colno, char_index }
    }

    fn process_string(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, quote_char) = self.lookahead.expect("Expected quote character");

        let string_type: TokType = match quote_char {
            '"' => TokType::DoubleQuotedString,
            '\'' => TokType::SingleQuotedString,
            _ => unreachable!("Expected quote character, but got {:?}", quote_char)
        };

        let mut last_char = quote_char;
        let mut escaping = false;
        loop {
            match self.advance() {
                None => {
                    break Err(self.make_error("Unterminated string starting at".to_string(), start_idx))
                },
                Some((idx, char)) => {
                    match char {
                        '\\' => {
                            // Toggle, so that an escaped backslash does not escape the next char.
                            escaping = !escaping;
                            last_char = char;
                            continue
                        }
                        '\n' => {
                            // A bare newline is only legal as an escaped line
                            // continuation, or as the tail of an escaped "\r\n".
                            if !escaping && last_char != '\r' {
                                break Err(self.make_error("Unexpected line terminator without continuation in string literal at".to_string(), idx))
                            }
                            escaping = false;
                            last_char = char;
                            continue
                        }
                        '\r' | '\u{2028}' | '\u{2029}' => {
                            if !escaping {
                                break Err(self.make_error("Unexpected line terminator without continuation in string literal at".to_string(), idx))
                            }
                            escaping = false;
                            last_char = char;
                            continue
                        },
                        c if c == quote_char && !escaping => {
                            break Ok((start_idx, string_type, idx + 1))
                        },
                        _ => {
                            escaping = false;
                            last_char = char;
                            continue
                        }
                    }
                }
            }
        }
    }

    fn process_whitespace(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting whitespace char");
        let mut last_index = start_idx;
        let mut last_char = start_char;
        loop {
            match self.chars.peek() {
                None => break Ok((start_idx, TokType::Whitespace, last_index + last_char.len_utf8())),
                Some((peeked_idx, peeked_char)) => {
                    if peeked_char.is_whitespace() {
                        last_index = *peeked_idx;
                        last_char = *peeked_char;
                        self.advance();
                        continue
                    } else {
                        break Ok((start_idx, TokType::Whitespace, last_index + last_char.len_utf8()))
                    }
                }
            }
        }
    }

    fn process_octal(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, _start_char) = self.lookahead.expect("Unexpected end of input, was processing octal");
        if self.configuration.allow_octal {
            todo!()
        } else {
            Err(self.make_error("Octal literals are forbidden".to_string(), start_idx))
        }
    }

    fn process_hexadecimal(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting numeric char");
        let (_, x_char) = self.advance().expect("Expected hex x");
        assert_eq!(start_char, '0');
        if x_char != 'x' && x_char != 'X' {
            unreachable!("Invalid hexadecimal here")
        }

        match self.advance() {
            None => {
                return Err(self.make_error("Expected at least one digit in hexadecimal literal".to_string(), start_idx))
            }
            Some((mut last_idx, first_digit)) => {
                if !HEX_CHARS.contains(first_digit) {
                    return Err(self.make_error(format!("Invalid hexadecimal character {:?} in literal starting at", first_digit), start_idx))
                }
                loop {
                    match self.chars.peek() {
                        None => break Ok((start_idx, TokType::Hexadecimal, last_idx + 1)),
                        Some((offset, char)) => {
                            if !HEX_CHARS.contains(*char) {
                                break Ok((start_idx, TokType::Hexadecimal, last_idx + 1))
                            }
                            last_idx = *offset;
                            self.advance();
                            continue
                        }
                    }
                }
            }
        }
    }

    fn process_number(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting numeric char");
        let mut last_index = start_idx;
        let mut decimal_seen: bool = false;
        let mut exponent_seen: bool = false;
        let mut unary_seen: bool = false;
        if start_char == '.' {
            decimal_seen = true
        }

        let maybe_second_char = self.chars.peek();
        match maybe_second_char {
            None => {
                if decimal_seen {
                    return Err(self.make_error("Lone decimal is an invalid literal".to_string(), start_idx))
                }
                return Ok((start_idx, TokType::Integer, start_idx + 1))
            },
            // A leading zero introduces either a hexadecimal or an (unsupported) octal literal.
            Some((_second_idx, second_char)) if start_char == '0' => {
                match second_char {
                    'x' | 'X' => {return self.process_hexadecimal()}
                    sc if sc.is_ascii_digit() => {
                        return self.process_octal()
                    },
                    _ => {}
                }
            }
            _ => {}
        }

        loop {
            match self.chars.peek() {
                None => {
                    if unary_seen || exponent_seen {
                        let (_, last_char) = self.lookahead.unwrap();
                        if "+-eE".contains(last_char) {
                            return Err(self.make_error(format!("Invalid number literal (missing digit after {:?})", last_char), start_idx))
                        }
                    }
                    if exponent_seen {
                        break Ok((start_idx, TokType::Exponent, last_index + 1))
                    } else if decimal_seen {
                        if start_idx == last_index {
                            return Err(self.make_error("Lone decimal is an invalid number literal".to_string(), start_idx))
                        }
                        break Ok((start_idx, TokType::Float, last_index + 1))
                    } else {
                        break Ok((start_idx, TokType::Integer, last_index + 1))
                    }
                },
                Some((next_idx, next_char)) => {
                    match *next_char {
                        c if c.is_ascii_digit() => {
                            last_index = *next_idx;
                            self.advance();
                            continue
                        },
                        '.' => {
                            if decimal_seen {
                                return Err(self.make_error("Invalid number literal (unexpected decimal)".to_string(), start_idx))
                            }
                            decimal_seen = true;
                            if exponent_seen {
                                return Err(self.make_error("Invalid exponent literal (float exponents forbidden) at".to_string(), start_idx))
                            }
                            last_index = *next_idx;
                            self.advance();
                            continue
                        },
                        'e' | 'E' => {
                            if exponent_seen {
                                return Err(self.make_error("Invalid number literal (only one exponent part is allowed)".to_string(), start_idx))
                            }
                            exponent_seen = true;
                            last_index = *next_idx;
                            self.advance();
                        }
                        '+' | '-' => {
                            // A sign is only valid immediately after the exponent marker (e.g. "1e-5").
                            let (_, previous_char) = self.lookahead.unwrap();
                            unary_seen = true;
                            match previous_char {
                                'e' | 'E' => {
                                    last_index = *next_idx;
                                    self.advance();
                                }
                                _ => {
                                    return Err(self.make_error("Unary within number literal only allowed after exponent part".to_string(), start_idx))
                                }
                            }
                        }
                        _ => {
                            // Any other character terminates the number; classify what was seen so far.
                            if unary_seen || exponent_seen {
                                let (_, last_char) = self.lookahead.unwrap();
                                if "+-eE".contains(last_char) {
                                    return Err(self.make_error(format!("Invalid number literal (missing digit after {:?})", last_char), start_idx))
                                }
                            }
                            if exponent_seen {
                                break Ok((start_idx, TokType::Exponent, last_index + 1))
                            } else if decimal_seen {
                                if start_idx == last_index {
                                    return Err(self.make_error("Lone decimal is an invalid number literal".to_string(), start_idx))
                                }
                                break Ok((start_idx, TokType::Float, last_index + 1))
                            } else {
                                break Ok((start_idx, TokType::Integer, last_index + 1))
                            }
                        }
                    }
                }
            }
        }
    }

    // Distinguishes keyword constants from general identifiers.
    fn tok_from_indices(&self, start: usize, end: usize) -> Result<TokenSpan, TokenizationError> {
        let lexeme = &self.text[start..end];
        match lexeme {
            "true" => Ok((start, TokType::True, end)),
            "false" => Ok((start, TokType::False, end)),
            "NaN" => Ok((start, TokType::Nan, end)),
            "Infinity" => Ok((start, TokType::Infinity, end)),
            "null" => Ok((start, TokType::Null, end)),
            _ => Ok((start, TokType::Name, end)),
        }
    }

    fn process_identifier_or_const(&mut self) -> Result<TokenSpan, TokenizationError> {
        use crate::utils::read_hex_digits;
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting identifier/const char");
        let mut last_idx = start_idx;
        use unicode_general_category::{get_general_category, GeneralCategory};
        // Validate the first character: a letter, `$`/`_`, or a `\uXXXX` escape.
        match start_char {
            c if c.is_alphabetic() => {}
            c if IDENTIFIER_START_SYMBOLS.contains(c) => {}
            '\\' => {
                match self.chars.peek() {
                    None => {return Err(self.make_error("Unexpected EOF".to_string(), start_idx))}
                    Some((_, c)) => {
                        match c {
                            'u' => {
                                let mut ubuffer = String::with_capacity(4);
                                self.advance();
                                for _ in 0..4 {
                                    match self.advance() {
                                        None => {
                                            return Err(self.make_error("Invalid identifier start".to_string(), start_idx))
                                        }
                                        Some((idx, c)) => {
                                            ubuffer.push(c);
                                            last_idx = idx;
                                            if !HEX_CHARS.contains(c) {
                                                return Err(self.make_error("Invalid identifier start".to_string(), start_idx))
                                            }
                                        }
                                    }
                                }
                                let maybe_hex_val = read_hex_digits(&mut ubuffer.chars().peekable(), 4, ubuffer.as_str());
                                match maybe_hex_val {
                                    Err(_) => {
                                        return Err(self.make_error(format!("invalid unicode escape: \\u{}", ubuffer), start_idx))
                                    }
                                    Ok(hex_val) => {
                                        let maybe_c = char::from_u32(hex_val);
                                        match maybe_c {
                                            None => {
                                                return Err(self.make_error(format!("invalid unicode escape value: \\u{}", ubuffer), start_idx))
                                            }
                                            Some(c) => {
                                                if !c.is_alphabetic() && !IDENTIFIER_START_SYMBOLS.contains(c) {
                                                    return Err(self.make_error(format!("Illegal identifier start from unicode escape sequence: \\u{}", ubuffer), start_idx))
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                            _ => {
                                return Err(self.make_error("Invalid identifier start".to_string(), start_idx))
                            }
                        }
                    }
                }
            }
            _ => {
                return Err(self.make_error(format!("Invalid character {}", start_char), start_idx))
            }
        }
        let mut last_char = start_char;
        loop {
            match self.chars.peek() {
                None => break self.tok_from_indices(start_idx, last_idx + last_char.len_utf8()),
                Some((next_idx, next_char)) => {
                    if next_char.is_whitespace() {
                        break self.tok_from_indices(start_idx, last_idx + last_char.len_utf8())
                    } else if next_char.is_alphanumeric() {
                        last_idx = *next_idx;
                        last_char = *next_char;
                        self.advance();
                        continue
                    } else if IDENTIFIER_PARTS.contains(*next_char) {
                        last_idx = *next_idx;
                        last_char = *next_char;
                        self.advance();
                        continue
                    } else if *next_char == '\\' {
                        // Only `\uXXXX` escapes may continue an identifier.
                        self.advance();
                        match self.advance() {
                            None => {return Err(self.make_error("Unexpected EOF".to_string(), start_idx))}
                            Some((_, c)) => {
                                match c {
                                    'u' => {
                                        for _ in 0..4 {
                                            match self.advance() {
                                                None => {
                                                    return Err(self.make_error("Unexpected EOF in unicode escape in identifier".to_string(), start_idx))
                                                }
                                                Some((_, c)) => {
                                                    if !HEX_CHARS.contains(c) {
                                                        return Err(self.make_error("Invalid hex digit in unicode escape in identifier".to_string(), start_idx))
                                                    }
                                                }
                                            }
                                        }
                                        (last_idx, last_char) = self.lookahead.unwrap()
                                    }
                                    _ => {
                                        return Err(self.make_error("Invalid escape in identifier (only \\u is allowed)".to_string(), start_idx))
                                    }
                                }
                            }
                        }
                    } else {
                        // Combining marks may extend an identifier.
                        match get_general_category(*next_char) {
                            GeneralCategory::NonspacingMark | GeneralCategory::SpacingMark => {
                                last_idx = *next_idx;
                                last_char = *next_char;
                                self.advance();
                                continue
                            }
                            _ => break self.tok_from_indices(start_idx, last_idx + last_char.len_utf8())
                        }
                    }
                }
            }
        }
    }

    fn process_comment(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, _char) = self.lookahead.expect("Expected comment start");
        let (mut last_idx, star_or_slash) = self.advance().expect("Expected second comment char");
        match star_or_slash {
            '/' => {
                // Line comment: runs to (and includes) the next line terminator.
                loop {
                    match self.chars.peek() {
                        None => {
                            return Ok((start_idx, TokType::LineComment, last_idx + 1))
                        },
                        Some((peeked_idx, peeked_char)) => {
                            match peeked_char {
                                '\n' | '\r' | '\u{2028}' | '\u{2029}' => {
                                    (last_idx, _) = self.advance().unwrap();
                                    return Ok((start_idx, TokType::LineComment, last_idx + 1))
                                }
                                _ => {
                                    last_idx = *peeked_idx;
                                    self.advance();
                                }
                            }
                        }
                    }
                }
            },
            '*' => {
                // Block comment: runs to the closing "*/".
                loop {
                    match self.chars.peek() {
                        None => {
                            return Err(self.make_error("Unexpected end of input while processing block comment".to_string(), start_idx))
                        }
                        Some((_peeked_idx, peeked_char)) => {
                            match peeked_char {
                                '*' => {
                                    self.advance();
                                    let maybe_next_next = self.chars.peek();
                                    match maybe_next_next {
                                        None => {
                                            return Err(self.make_error("Unexpected end of input while processing block comment".to_string(), start_idx))
                                        },
                                        Some((_next_peeked_idx, next_peeked_char)) => {
                                            match next_peeked_char {
                                                '/' => {
                                                    (last_idx, _) = self.advance().unwrap();
                                                    // Span ends are exclusive, so step one past the closing '/'.
                                                    return Ok((start_idx, TokType::BlockComment, last_idx + 1))
                                                }
                                                _ => {
                                                    continue
                                                }
                                            }
                                        }
                                    }
                                }
                                _ => {
                                    self.advance();
                                    continue
                                }
                            }
                        }
                    }
                }
            }
            _ => unreachable!("Invalid second comment char")
        }
    }

    fn next_token(&mut self) -> Result<TokenSpan, TokenizationError> {
        let maybe_last = self.lookahead;
        let maybe_next = self.advance();
        match maybe_next {
            None => {
                // The EOF token is an empty span at the end of the last consumed character.
                match maybe_last {
                    Some((last_idx, last_char)) => Ok((last_idx + last_char.len_utf8(), TokType::EOF, last_idx + last_char.len_utf8())),
                    None => Ok((0, TokType::EOF, 0)),
                }
            }
            Some((next_idx, next)) => {
                match next {
                    '{' => Ok((next_idx, TokType::LeftBrace, next_idx + 1)),
                    '}' => Ok((next_idx, TokType::RightBrace, next_idx + 1)),
                    '[' => Ok((next_idx, TokType::LeftBracket, next_idx + 1)),
                    ']' => Ok((next_idx, TokType::RightBracket, next_idx + 1)),
                    ',' => Ok((next_idx, TokType::Comma, next_idx + 1)),
                    ':' => Ok((next_idx, TokType::Colon, next_idx + 1)),
                    '+' => Ok((next_idx, TokType::Plus, next_idx + 1)),
                    '-' => Ok((next_idx, TokType::Minus, next_idx + 1)),
                    '\'' | '"' => self.process_string(),
                    '.' => self.process_number(),
                    // A BOM is not covered by `char::is_whitespace`, but is treated as whitespace here.
                    '\u{FEFF}' => {
                        let whitespace_tok = self.process_whitespace()?;
                        if self.configuration.include_whitespace {
                            Ok(whitespace_tok)
                        } else {
                            self.next_token()
                        }
                    }
                    c if c.is_whitespace() => {
                        let whitespace_tok = self.process_whitespace()?;
                        if self.configuration.include_whitespace {
                            Ok(whitespace_tok)
                        } else {
                            self.next_token()
                        }
                    },
                    c if c.is_ascii_digit() => self.process_number(),
                    '/' => {
                        // Sentinel pair: any non-comment follower (or EOF) makes '/' invalid.
                        let (_, next_next) = self.chars.peek().unwrap_or(&(usize::MAX, '!'));
                        match next_next {
                            '/' | '*' => {
                                if self.configuration.include_comments {
                                    self.process_comment()
                                } else {
                                    self.process_comment()?;
                                    self.next_token()
                                }
                            },
                            _ => {
                                Err(self.make_error("unexpected token '/'".to_string(), next_idx))
                            }
                        }
                    }
                    _ => self.process_identifier_or_const()
                }
            }
        }
    }

    pub(crate) fn tokenize(&mut self) -> Result<Tokens<'input>, TokenizationError> {
        let mut tokens: Vec<TokenSpan> = Vec::new();
        loop {
            let tok = self.next_token()?;
            let is_eof = tok.1 == TokType::EOF;
            tokens.push(tok);
            if is_eof {
                break
            }
        }
        Ok(Tokens { tok_spans: tokens, source: self.text })
    }
}

impl<'input> Iterator for Tokenizer<'input> {
    type Item = Result<TokenSpan, TokenizationError>;
    fn next(&mut self) -> Option<Self::Item> {
        // The EOF token terminates iteration rather than being yielded.
        match self.next_token() {
            Ok(span) => {
                match span.1 {
                    TokType::EOF => None,
                    _ => Some(Ok(span))
                }
            }
            Err(e) => Some(Err(e))
        }
    }
}

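/// Tokenizes JSON5 text with the default configuration: whitespace and
/// comments are validated and consumed, but not emitted as tokens.
///
/// A hedged usage sketch (assumes this function is visible at the call site;
/// adjust the `use` path to the actual crate layout):
///
/// ```ignore
/// let toks = tokenize_str("[1, 2]").unwrap();
/// // LeftBracket, Integer, Comma, Integer, RightBracket, EOF
/// assert_eq!(toks.tok_spans.len(), 6);
/// ```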
pub fn tokenize_str(text: &'_ str) -> Result<Tokens<'_>, TokenizationError> {
    Tokenizer::new(text).tokenize()
}

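/// Tokenizes JSON5 text for round-tripping: whitespace and comment tokens are
/// kept, so the concatenated span slices reproduce the source exactly.
///
/// A hedged usage sketch, mirroring the `tokenize_str` example above:
///
/// ```ignore
/// let toks = tokenize_rt_str("[1, 2]").unwrap();
/// // The space after the comma is now a Whitespace token of its own.
/// assert_eq!(toks.tok_spans.len(), 7);
/// ```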
pub fn tokenize_rt_str(text: &'_ str) -> Result<Tokens<'_>, TokenizationError> {
    let config = TokenizerConfig { include_comments: true, include_whitespace: true, allow_octal: false };
    Tokenizer::with_configuration(text, config).tokenize()
}

pub fn tokenize_bytes(bytes: &'_ [u8]) -> Result<Tokens<'_>, TokenizationError> {
    let maybe_text = std::str::from_utf8(bytes);
    match maybe_text {
        Ok(text) => {
            Tokenizer::new(text).tokenize()
        }
        Err(e) => {
            let valid_point = e.valid_up_to();
            if valid_point > 0 {
                let valid_text = std::str::from_utf8(&bytes[..valid_point]).unwrap();
                let (lineno, colno, char_index) = get_line_col_char(valid_text, valid_point);
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno, colno, char_index, index: valid_point })
            } else {
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno: 1, colno: 0, char_index: 0, index: 0 })
            }
        }
    }
}

pub fn tokenize_rt_bytes(bytes: &'_ [u8]) -> Result<Tokens<'_>, TokenizationError> {
    let maybe_text = std::str::from_utf8(bytes);
    match maybe_text {
        Ok(text) => {
            let config = TokenizerConfig { include_comments: true, include_whitespace: true, allow_octal: false };
            Tokenizer::with_configuration(text, config).tokenize()
        }
        Err(e) => {
            let valid_point = e.valid_up_to();
            if valid_point > 0 {
                let valid_text = std::str::from_utf8(&bytes[..valid_point]).unwrap();
                let (lineno, colno, char_index) = get_line_col_char(valid_text, valid_point);
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno, colno, char_index, index: valid_point })
            } else {
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno: 1, colno: 0, char_index: 0, index: 0 })
            }
        }
    }
}

#[cfg(test)]
mod test {
    use crate::tokenize::TokType::*;
    use super::*;

    #[test]
    fn test_empty_input() {
        let text = "";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, EOF, 0)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_empty_object() {
        let text = "{}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, RightBrace, 2), (2, EOF, 2)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_object_double_quoted_strings() {
        let text = "{\"foo\":\"bar\"}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, DoubleQuotedString, 6), (6, Colon, 7), (7, DoubleQuotedString, 12), (12, RightBrace, 13), (13, EOF, 13)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_object_round_trip_tokens() {
        let text = "{\"foo\":\"bar\"}";
        let toks = tokenize_rt_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, DoubleQuotedString, 6), (6, Colon, 7), (7, DoubleQuotedString, 12), (12, RightBrace, 13), (13, EOF, 13)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_single_quoted_string() {
        let text = "{'foo':'bar'}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, SingleQuotedString, 6), (6, Colon, 7), (7, SingleQuotedString, 12), (12, RightBrace, 13), (13, EOF, 13)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_array() {
        let text = "[1,2,3]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBracket, 1), (1, Integer, 2), (2, Comma, 3), (3, Integer, 4), (4, Comma, 5), (5, Integer, 6), (6, RightBracket, 7), (7, EOF, 7)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_float_number() {
        let text = "[1.23,4.56]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBracket, 1), (1, Float, 5), (5, Comma, 6), (6, Float, 10), (10, RightBracket, 11), (11, EOF, 11)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_exponent_number() {
        let text = "[1e10,2e-5]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBracket, 1), (1, Exponent, 5), (5, Comma, 6), (6, Exponent, 10), (10, RightBracket, 11), (11, EOF, 11)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_whitespace() {
        let text = " {\n\t} ";
        let toks = Tokenizer::with_configuration(text, TokenizerConfig { include_whitespace: true, include_comments: true, allow_octal: false }).tokenize().unwrap();
        let expected = Tokens { tok_spans: vec![(0, Whitespace, 1), (1, LeftBrace, 2), (2, Whitespace, 4), (4, RightBrace, 5), (5, Whitespace, 6), (6, EOF, 6)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_true_false_null() {
        let text = "[true,false,null]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, LeftBracket, 1), (1, True, 5), (5, Comma, 6), (6, False, 11), (11, Comma, 12), (12, Null, 16), (16, RightBracket, 17), (17, EOF, 17)] };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_number() {
        let text = "123";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, Integer, 3), (3, EOF, 3)] };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_unexpected_symbol() {
        let text = "1!2";
        tokenize_str(text).unwrap_err();
    }

    #[test]
    fn test_special_things() {
        let text = r#"{$_:1,_$:2,a\u200C:3}"#;
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, LeftBrace, 1), (1, Name, 3), (3, Colon, 4), (4, Integer, 5), (5, Comma, 6), (6, Name, 8), (8, Colon, 9), (9, Integer, 10), (10, Comma, 11), (11, Name, 18), (18, Colon, 19), (19, Integer, 20), (20, RightBrace, 21), (21, EOF, 21)] };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_eof_after_multibyte() {
        let text = r#"ë"#;
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, Name, 2), (2, EOF, 2)] };
        assert_eq!(toks, expected);
    }
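
    // The following tests document behavior implied by the span conventions
    // above (exclusive end offsets; comment/whitespace tokens only under the
    // round-trip configuration).
    #[test]
    fn test_hexadecimal() {
        let text = "0x1A";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, Hexadecimal, 4), (4, EOF, 4)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_octal_forbidden_by_default() {
        let text = "01";
        tokenize_str(text).unwrap_err();
    }

    #[test]
    fn test_block_comment_span() {
        let text = "/**/";
        let toks = tokenize_rt_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, BlockComment, 4), (4, EOF, 4)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_line_comment_includes_terminator() {
        let text = "//x\n";
        let toks = tokenize_rt_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LineComment, 4), (4, EOF, 4)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_iterator_interface() {
        // The Iterator impl yields every token except the trailing EOF.
        let spans: Vec<TokenSpan> = Tokenizer::new("[]").map(|r| r.unwrap()).collect();
        assert_eq!(spans, vec![(0, LeftBracket, 1), (1, RightBracket, 2)]);
    }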
}