use std::fmt::{Display, Formatter};
use std::iter::Peekable;
use std::str::CharIndices;
use crate::utils::get_line_col_char;

#[derive(PartialEq, Clone, Debug)]
pub enum TokType {
    LeftBrace,
    RightBrace,
    LeftBracket,
    RightBracket,
    Comma,
    Colon,
    Name,
    SingleQuotedString,
    DoubleQuotedString,
    BlockComment,
    LineComment,
    Whitespace,
    True,
    False,
    Null,
    Integer,
    Float,
    Infinity,
    Nan,
    Exponent,
    Hexadecimal,
    Plus,
    Minus,
    EOF,
}

/// A token span: `(start, token_type, end)`, where `start` and `end` are byte
/// offsets into the source and `end` is exclusive.
pub(crate) type TokenSpan = (usize, TokType, usize);

#[derive(Debug, PartialEq)]
pub struct Tokens<'input> {
    pub tok_spans: Vec<TokenSpan>,
    pub(crate) source: &'input str,
}
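
// For example, tokenizing "{}" produces the spans
// (0, LeftBrace, 1), (1, RightBrace, 2), (2, EOF, 2).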

#[derive(Debug)]
pub struct TokenizationError {
    pub message: String,
    pub index: usize,      // byte offset into the source
    pub lineno: usize,
    pub colno: usize,
    pub char_index: usize, // character offset (from get_line_col_char)
}

impl Display for TokenizationError {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(f, "TokenizationError: {}: line {} column {} (char {})", self.message, self.lineno, self.colno, self.char_index)
    }
}
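
// For example, tokenizing "1!2" fails at the '!' and displays roughly as:
//   TokenizationError: Invalid character !: line 1 column 2 (char 1)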

#[derive(Debug)]
pub(crate) struct Tokenizer<'input> {
    configuration: TokenizerConfig,
    text: &'input str,
    chars: Peekable<CharIndices<'input>>,
    lookahead: Option<(usize, char)>, // the most recently consumed (byte index, char)
}

const HEX_CHARS: &str = "0123456789abcdefABCDEF";
// Non-alphabetic characters that may start an identifier.
const IDENTIFIER_START_SYMBOLS: &str = "$_";
// Non-alphanumeric characters that may continue an identifier: ZWNJ, ZWJ, and
// the connector-punctuation characters.
const IDENTIFIER_PARTS: &str = "$_\u{200C}\u{200D}\u{005F}\u{203F}\u{2040}\u{2054}\u{FE33}\u{FE34}\u{FE4D}\u{FE4E}\u{FE4F}\u{FF3F}";

#[derive(Debug)]
pub struct TokenizerConfig {
    pub include_whitespace: bool,
    pub include_comments: bool,
    pub allow_octal: bool,
}

impl Default for TokenizerConfig {
    fn default() -> Self {
        Self::new()
    }
}

impl TokenizerConfig {
    pub fn new() -> Self {
        TokenizerConfig { include_whitespace: false, include_comments: false, allow_octal: false }
    }
}
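
// A minimal configuration sketch; the round-trip helpers at the bottom of this
// file build the same configuration to preserve whitespace and comments:
//
//     let config = TokenizerConfig { include_whitespace: true, include_comments: true, allow_octal: false };
//     let toks = Tokenizer::with_configuration("[1] // one", config).tokenize();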

impl<'input> Tokenizer<'input> {
    pub fn new(text: &'input str) -> Self {
        Tokenizer { configuration: TokenizerConfig::new(), text, chars: text.char_indices().peekable(), lookahead: None }
    }

    pub fn with_configuration(text: &'input str, configuration: TokenizerConfig) -> Self {
        Tokenizer { configuration, text, chars: text.char_indices().peekable(), lookahead: None }
    }

    // Consume the next character, recording it as the current `lookahead`.
    fn advance(&mut self) -> Option<(usize, char)> {
        self.lookahead = self.chars.next();
        self.lookahead
    }

    fn make_error(&self, message: String, start_index: usize) -> TokenizationError {
        let (lineno, colno, char_index) = get_line_col_char(self.text, start_index);
        TokenizationError { message, index: start_index, lineno, colno, char_index }
    }

    fn process_string(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, quote_char) = self.lookahead.expect("Expected quote character");

        let string_type: TokType = match quote_char {
            '"' => TokType::DoubleQuotedString,
            '\'' => TokType::SingleQuotedString,
            _ => unreachable!("Expected quote character, but got {:?}", quote_char)
        };

        let mut last_char = quote_char;

        let mut escaping = false;
        loop {
            match self.advance() {
                None => {
                    break Err(self.make_error("Unterminated string starting at".to_string(), start_idx))
                },
                Some((idx, char)) => {
                    match char {
                        '\\' => {
                            // Toggle, so a backslash escaped by a backslash does not escape the next char.
                            escaping = !escaping;
                            last_char = char;
                            continue
                        }
                        '\n' => {
                            // A bare newline is only allowed as a line continuation (after `\`)
                            // or as the second half of an escaped `\r\n` sequence.
                            if !escaping && last_char != '\r' {
                                break Err(self.make_error("Unexpected line terminator without continuation in string literal at".to_string(), idx))
                            }
                            escaping = false;
                            last_char = char;
                            continue
                        }
                        '\r' | '\u{2028}' | '\u{2029}' => {
                            if !escaping {
                                break Err(self.make_error("Unexpected line terminator without continuation in string literal at".to_string(), idx))
                            }
                            escaping = false;
                            last_char = char;
                            continue
                        },
                        c if c == quote_char && !escaping => {
                            break Ok((start_idx, string_type, idx + 1))
                        },
                        _ => {
                            escaping = false;
                            last_char = char;
                            continue
                        }
                    }
                }
            }
        }
    }
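
    // E.g. in "{'foo':'bar'}" the first string token is (1, SingleQuotedString, 6):
    // the span starts at the opening quote and ends one past the closing quote.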

    fn process_whitespace(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting whitespace char");
        let mut last_index = start_idx;
        let mut last_char = start_char;
        loop {
            match self.chars.peek() {
                None => break Ok((start_idx, TokType::Whitespace, last_index + last_char.len_utf8())),
                Some((peeked_idx, peeked_char)) => {
                    if peeked_char.is_whitespace() {
                        last_index = *peeked_idx;
                        last_char = *peeked_char;
                        self.advance();
                        continue
                    } else {
                        break Ok((start_idx, TokType::Whitespace, last_index + last_char.len_utf8()))
                    }
                }
            }
        }
    }
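
    // E.g. for " {\n\t} " the leading space is (0, Whitespace, 1) and the
    // "\n\t" run inside the braces is (2, Whitespace, 4).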

    fn process_octal(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, _start_char) = self.lookahead.expect("Unexpected end of input, was processing octal");
        if self.configuration.allow_octal {
            todo!()
        } else {
            Err(self.make_error("Octal literals are forbidden".to_string(), start_idx))
        }
    }

    fn process_hexadecimal(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting numeric char");
        let (_, x_char) = self.advance().expect("Expected hex x");
        assert_eq!(start_char, '0');
        if x_char != 'x' && x_char != 'X' {
            unreachable!("Invalid hexadecimal here")
        }

        match self.advance() {
            None => {
                Err(self.make_error("Expected at least one digit in hexadecimal literal".to_string(), start_idx))
            }
            Some((mut last_idx, first_digit)) => {
                if !HEX_CHARS.contains(first_digit) {
                    return Err(self.make_error(format!("Invalid hexadecimal character {first_digit:?} in literal starting at"), start_idx))
                }
                // Consume hex digits until a non-hex character is peeked.
                loop {
                    match self.chars.peek() {
                        None => break Ok((start_idx, TokType::Hexadecimal, last_idx + 1)),
                        Some((offset, char)) => {
                            if !HEX_CHARS.contains(*char) {
                                break Ok((start_idx, TokType::Hexadecimal, last_idx + 1))
                            }
                            last_idx = *offset;
                            self.advance();
                            continue
                        }
                    }
                }
            }
        }
    }
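
    // E.g. "0xABC" tokenizes as (0, Hexadecimal, 5); "0x" alone is an error
    // because at least one digit must follow the prefix.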

    fn process_number(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting numeric char");
        let mut last_index = start_idx;
        let mut decimal_seen: bool = false;
        let mut exponent_seen: bool = false;
        let mut unary_seen: bool = false;
        if start_char == '.' {
            decimal_seen = true
        }

        // A leading '0' dispatches to hexadecimal ('0x'/'0X') or octal (digit follows).
        let maybe_second_char = self.chars.peek();
        match maybe_second_char {
            None => {
                if decimal_seen {
                    return Err(self.make_error("Lone decimal is an invalid literal".to_string(), start_idx))
                }
                return Ok((start_idx, TokType::Integer, start_idx + 1))
            },
            Some((_second_idx, second_char)) if start_char == '0' => {
                match second_char {
                    'x' | 'X' => {return self.process_hexadecimal()}
                    sc if sc.is_ascii_digit() => {
                        return self.process_octal()
                    },
                    _ => {}
                }
            }
            _ => {}
        }

        loop {
            match self.chars.peek() {
                None => {
                    // End of input: the literal must not end in 'e', 'E', '+', or '-'.
                    if unary_seen || exponent_seen {
                        let (_, last_char) = self.lookahead.unwrap();
                        if "+-eE".contains(last_char) {
                            return Err(self.make_error(format!("Invalid number literal (missing digit after {last_char:?})"), start_idx))
                        }
                    }
                    if exponent_seen {
                        break Ok((start_idx, TokType::Exponent, last_index + 1))
                    } else if decimal_seen {
                        if start_idx == last_index {
                            return Err(self.make_error("Lone decimal is an invalid number literal".to_string(), start_idx))
                        }
                        break Ok((start_idx, TokType::Float, last_index + 1))
                    } else {
                        break Ok((start_idx, TokType::Integer, last_index + 1))
                    }
                },
                Some((next_idx, next_char)) => {
                    match *next_char {
                        c if c.is_ascii_digit() => {
                            last_index = *next_idx;
                            self.advance();
                            continue
                        },
                        '.' => {
                            if decimal_seen {
                                return Err(self.make_error("Invalid number literal (unexpected decimal)".to_string(), start_idx))
                            }
                            decimal_seen = true;
                            if exponent_seen {
                                return Err(self.make_error("Invalid exponent literal (float exponents forbidden) at".to_string(), start_idx))
                            }
                            last_index = *next_idx;
                            self.advance();
                            continue
                        },
                        'e' | 'E' => {
                            if exponent_seen {
                                return Err(self.make_error("Invalid number literal (only one exponent part is allowed)".to_string(), start_idx))
                            }
                            exponent_seen = true;
                            last_index = *next_idx;
                            self.advance();
                        }
                        '+' | '-' => {
                            // A sign is only valid immediately after the 'e'/'E' of an exponent.
                            let (_, previous_char) = self.lookahead.unwrap();
                            unary_seen = true;
                            match previous_char {
                                'e' | 'E' => {
                                    last_index = *next_idx;
                                    self.advance();
                                }
                                _ => {
                                    return Err(self.make_error("Unary within number literal only allowed after exponent part".to_string(), start_idx))
                                }
                            }
                        }
                        _ => {
                            // Any other character terminates the literal; apply the same
                            // end-of-literal checks as the end-of-input case above.
                            if unary_seen || exponent_seen {
                                let (_, last_char) = self.lookahead.unwrap();
                                if "+-eE".contains(last_char) {
                                    return Err(self.make_error(format!("Invalid number literal (missing digit after {last_char:?})"), start_idx))
                                }
                            }
                            if exponent_seen {
                                break Ok((start_idx, TokType::Exponent, last_index + 1))
                            } else if decimal_seen {
                                if start_idx == last_index {
                                    return Err(self.make_error("Lone decimal is an invalid number literal".to_string(), start_idx))
                                }
                                break Ok((start_idx, TokType::Float, last_index + 1))
                            } else {
                                break Ok((start_idx, TokType::Integer, last_index + 1))
                            }
                        }
                    }
                }
            }
        }
    }
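
    // E.g. "123" -> Integer, "1.23" -> Float, "1e10" and "2e-5" -> Exponent
    // (see the tests below); ".5" is a valid Float, while "." alone is an error.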

    fn tok_from_indices(&self, start: usize, end: usize) -> Result<TokenSpan, TokenizationError> {
        let lexeme = &self.text[start..end];
        match lexeme {
            "true" => Ok((start, TokType::True, end)),
            "false" => Ok((start, TokType::False, end)),
            "NaN" => Ok((start, TokType::Nan, end)),
            "Infinity" => Ok((start, TokType::Infinity, end)),
            "null" => Ok((start, TokType::Null, end)),
            _ => Ok((start, TokType::Name, end)),
        }
    }
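
    // E.g. the lexeme "null" yields TokType::Null, while "nullx" or "NULL"
    // falls through to TokType::Name.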

    fn process_identifier_or_const(&mut self) -> Result<TokenSpan, TokenizationError> {
        use crate::utils::read_hex_digits;
        use unicode_general_category::{get_general_category, GeneralCategory};
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting identifier/const char");
        let mut last_idx = start_idx;
        // Validate the identifier start: a letter, '$', '_', or a \uXXXX escape
        // that decodes to one of those.
        match start_char {
            c if c.is_alphabetic() => {}
            c if IDENTIFIER_START_SYMBOLS.contains(c) => {}
            '\\' => {
                match self.chars.peek() {
                    None => {return Err(self.make_error("Unexpected EOF".to_string(), start_idx))}
                    Some((_, c)) => {
                        match c {
                            'u' => {
                                let mut ubuffer = String::with_capacity(4);
                                self.advance();
                                for _ in 0..4 {
                                    match self.advance() {
                                        None => {
                                            return Err(self.make_error("Invalid identifier start".to_string(), start_idx))
                                        }
                                        Some((idx, c)) => {
                                            ubuffer.push(c);
                                            last_idx = idx;
                                            if !HEX_CHARS.contains(c) {
                                                return Err(self.make_error("Invalid identifier start".to_string(), start_idx))
                                            }
                                        }
                                    }
                                }
                                let maybe_hex_val = read_hex_digits(&mut ubuffer.chars().peekable(), 4, ubuffer.as_str());
                                match maybe_hex_val {
                                    Err(_) => {
                                        return Err(self.make_error(format!("invalid unicode escape: \\u{ubuffer}"), start_idx))
                                    }
                                    Ok(hex_val) => {
                                        let maybe_c = char::from_u32(hex_val);
                                        match maybe_c {
                                            None => {
                                                return Err(self.make_error(format!("invalid unicode escape value: \\u{ubuffer}"), start_idx))
                                            }
                                            Some(c) => {
                                                if !c.is_alphabetic() && !IDENTIFIER_START_SYMBOLS.contains(c) {
                                                    return Err(self.make_error(format!("Illegal identifier start from unicode escape sequence: \\u{ubuffer}"), start_idx))
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                            _ => {
                                return Err(self.make_error("Invalid identifier start".to_string(), start_idx))
                            }
                        }
                    }
                }
            }
            _ => {
                return Err(self.make_error(format!("Invalid character {start_char}"), start_idx))
            }
        }
        // Consume identifier continuation characters.
        let mut last_char = start_char;
        loop {
            match self.chars.peek() {
                None => break self.tok_from_indices(start_idx, last_idx + last_char.len_utf8()),
                Some((next_idx, next_char)) => {
                    if next_char.is_whitespace() {
                        break self.tok_from_indices(start_idx, last_idx + last_char.len_utf8())
                    } else if next_char.is_alphanumeric() {
                        last_idx = *next_idx;
                        last_char = *next_char;
                        self.advance();
                        continue
                    } else if IDENTIFIER_PARTS.contains(*next_char) {
                        last_idx = *next_idx;
                        last_char = *next_char;
                        self.advance();
                        continue
                    } else if *next_char == '\\' {
                        // Unicode escapes are also allowed in the continuation.
                        self.advance();
                        match self.advance() {
                            None => {return Err(self.make_error("Unexpected EOF".to_string(), start_idx))}
                            Some((_, c)) => {
                                match c {
                                    'u' => {
                                        for _ in 0..4 {
                                            match self.advance() {
                                                None => {
                                                    return Err(self.make_error("Unexpected end of input in unicode escape in identifier".to_string(), start_idx))
                                                }
                                                Some((_, c)) => {
                                                    if !HEX_CHARS.contains(c) {
                                                        return Err(self.make_error("Invalid unicode escape in identifier".to_string(), start_idx))
                                                    }
                                                }
                                            }
                                        }
                                        (last_idx, last_char) = self.lookahead.unwrap()
                                    }
                                    _ => {
                                        return Err(self.make_error("Invalid escape in identifier (only unicode escapes are allowed)".to_string(), start_idx))
                                    }
                                }
                            }
                        }
                    } else {
                        // Combining marks may also continue an identifier.
                        match get_general_category(*next_char) {
                            GeneralCategory::NonspacingMark | GeneralCategory::SpacingMark => {
                                last_idx = *next_idx;
                                last_char = *next_char;
                                self.advance();
                                continue
                            }
                            _ => break self.tok_from_indices(start_idx, last_idx + last_char.len_utf8())
                        }
                    }
                }
            }
        }
    }
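
    // E.g. in {a\u200C:3} the key is a single Name token spanning the escape:
    // (11, Name, 18) in the test_special_things case below.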

    fn process_comment(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, _char) = self.lookahead.expect("Expected comment start");
        let (mut last_idx, star_or_slash) = self.advance().expect("Expected second comment char");
        match star_or_slash {
            '/' => {
                // Line comment: runs up to and including the line terminator.
                loop {
                    match self.chars.peek() {
                        None => {
                            return Ok((start_idx, TokType::LineComment, last_idx + 1))
                        },
                        Some((peeked_idx, peeked_char)) => {
                            match peeked_char {
                                '\n' | '\r' | '\u{2028}' | '\u{2029}' => {
                                    (last_idx, _) = self.advance().unwrap();
                                    return Ok((start_idx, TokType::LineComment, last_idx + 1))
                                }
                                _ => {
                                    last_idx = *peeked_idx;
                                    self.advance();
                                }
                            }
                        }
                    }
                }
            },
            '*' => {
                // Block comment: runs up to and including the closing "*/".
                loop {
                    match self.chars.peek() {
                        None => {
                            return Err(self.make_error("Unexpected end of input while processing block comment".to_string(), start_idx))
                        }
                        Some((_peeked_idx, peeked_char)) => {
                            match peeked_char {
                                '*' => {
                                    self.advance();
                                    let maybe_next_next = self.chars.peek();
                                    match maybe_next_next {
                                        None => {
                                            return Err(self.make_error("Unexpected end of input while processing block comment".to_string(), start_idx))
                                        },
                                        Some((_next_peeked_idx, next_peeked_char)) => {
                                            match next_peeked_char {
                                                '/' => {
                                                    (last_idx, _) = self.advance().unwrap();
                                                    return Ok((start_idx, TokType::BlockComment, last_idx + 1))
                                                }
                                                _ => {
                                                    continue
                                                }
                                            }
                                        }
                                    }
                                }
                                _ => {
                                    self.advance();
                                    continue
                                }
                            }
                        }
                    }
                }
            }
            _ => unreachable!("Invalid second comment char")
        }
    }
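
    // E.g. "// hi\n" is a single LineComment span that includes the newline,
    // and "/* hi */" is a BlockComment spanning all eight bytes.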

    fn next_token(&mut self) -> Result<TokenSpan, TokenizationError> {
        let maybe_last = self.lookahead;
        let maybe_next = self.advance();
        match maybe_next {
            None => {
                // EOF token: a zero-width span just past the last character.
                match maybe_last {
                    Some((last_idx, last_char)) => Ok((last_idx + last_char.len_utf8(), TokType::EOF, last_idx + last_char.len_utf8())),
                    None => Ok((0, TokType::EOF, 0)),
                }
            }
            Some((next_idx, next)) => {
                match next {
                    '{' => Ok((next_idx, TokType::LeftBrace, next_idx + 1)),
                    '}' => Ok((next_idx, TokType::RightBrace, next_idx + 1)),
                    '[' => Ok((next_idx, TokType::LeftBracket, next_idx + 1)),
                    ']' => Ok((next_idx, TokType::RightBracket, next_idx + 1)),
                    ',' => Ok((next_idx, TokType::Comma, next_idx + 1)),
                    ':' => Ok((next_idx, TokType::Colon, next_idx + 1)),
                    '+' => Ok((next_idx, TokType::Plus, next_idx + 1)),
                    '-' => Ok((next_idx, TokType::Minus, next_idx + 1)),
                    '\'' | '"' => self.process_string(),
                    '.' => self.process_number(),
                    '\u{FEFF}' => {
                        // The BOM is not `char::is_whitespace`, so treat it as whitespace explicitly.
                        let whitespace_tok = self.process_whitespace()?;
                        if self.configuration.include_whitespace {
                            Ok(whitespace_tok)
                        } else {
                            self.next_token()
                        }
                    }
                    c if c.is_whitespace() => {
                        let whitespace_tok = self.process_whitespace()?;
                        if self.configuration.include_whitespace {
                            Ok(whitespace_tok)
                        } else {
                            self.next_token()
                        }
                    },
                    c if c.is_ascii_digit() => self.process_number(),
                    '/' => {
                        // '!' is only a sentinel: any non-comment char makes the match below fail.
                        let (_, next_next) = self.chars.peek().unwrap_or(&(usize::MAX, '!'));
                        match next_next {
                            '/' | '*' => {
                                if self.configuration.include_comments {
                                    self.process_comment()
                                } else {
                                    self.process_comment()?;
                                    self.next_token()
                                }
                            },
                            _ => {
                                Err(self.make_error("unexpected token '/'".to_string(), next_idx))
                            }
                        }
                    }
                    _ => self.process_identifier_or_const()
                }
            }
        }
    }

    pub(crate) fn tokenize(&mut self) -> Result<Tokens<'input>, TokenizationError> {
        let mut tokens: Vec<TokenSpan> = Vec::new();
        loop {
            let tok = self.next_token()?;
            let is_eof = tok.1 == TokType::EOF;
            tokens.push(tok);
            if is_eof {
                break
            }
        }
        Ok(Tokens { tok_spans: tokens, source: self.text })
    }
}

impl<'input> Iterator for Tokenizer<'input> {
    type Item = Result<TokenSpan, TokenizationError>;

    // Yields one span per token; the trailing EOF token is not yielded.
    fn next(&mut self) -> Option<Self::Item> {
        match self.next_token() {
            Ok(span) => {
                match span.1 {
                    TokType::EOF => None,
                    _ => Some(Ok(span)),
                }
            }
            Err(e) => Some(Err(e)),
        }
    }
}
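
// A minimal iteration sketch; the EOF token itself is never yielded:
//
//     for item in Tokenizer::new("[1,2]") {
//         let (start, tok_type, end) = item.expect("tokenization failed");
//         // ...
//     }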

/// Tokenize JSON5 text with the default configuration (whitespace and
/// comment tokens are skipped).
pub fn tokenize_str(text: &'_ str) -> Result<Tokens<'_>, TokenizationError> {
    Tokenizer::new(text).tokenize()
}

/// Tokenize JSON5 text, keeping whitespace and comment tokens so the original
/// text can be round-tripped from the spans.
pub fn tokenize_rt_str(text: &'_ str) -> Result<Tokens<'_>, TokenizationError> {
    let config = TokenizerConfig { include_comments: true, include_whitespace: true, allow_octal: false };
    Tokenizer::with_configuration(text, config).tokenize()
}
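
// For example, with "[1] // x" `tokenize_str` yields only the bracket, integer,
// and EOF spans, while `tokenize_rt_str` also yields the Whitespace and
// LineComment spans in between.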

/// Tokenize UTF-8 bytes with the default configuration. If the bytes are not
/// valid UTF-8, the error reports the location of the first invalid sequence.
pub fn tokenize_bytes(bytes: &'_ [u8]) -> Result<Tokens<'_>, TokenizationError> {
    let maybe_text = std::str::from_utf8(bytes);
    match maybe_text {
        Ok(text) => {
            Tokenizer::new(text).tokenize()
        }
        Err(e) => {
            let valid_point = e.valid_up_to();
            if valid_point > 0 {
                let valid_text = std::str::from_utf8(&bytes[..valid_point]).unwrap();
                let (lineno, colno, char_index) = get_line_col_char(valid_text, valid_point);
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno, colno, char_index, index: valid_point })
            } else {
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno: 1, colno: 0, char_index: 0, index: 0 })
            }
        }
    }
}
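
// For example, tokenize_bytes(b"[1,\xFF]") fails at the undecodable byte and
// reports its location in the "Invalid UTF8 at" error.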

/// Round-trip variant of `tokenize_bytes`: whitespace and comment tokens are kept.
pub fn tokenize_rt_bytes(bytes: &'_ [u8]) -> Result<Tokens<'_>, TokenizationError> {
    let maybe_text = std::str::from_utf8(bytes);
    match maybe_text {
        Ok(text) => {
            let config = TokenizerConfig { include_comments: true, include_whitespace: true, allow_octal: false };
            Tokenizer::with_configuration(text, config).tokenize()
        }
        Err(e) => {
            let valid_point = e.valid_up_to();
            if valid_point > 0 {
                let valid_text = std::str::from_utf8(&bytes[..valid_point]).unwrap();
                let (lineno, colno, char_index) = get_line_col_char(valid_text, valid_point);
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno, colno, char_index, index: valid_point })
            } else {
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno: 1, colno: 0, char_index: 0, index: 0 })
            }
        }
    }
}

#[cfg(test)]
mod test {
    use crate::tokenize::TokType::*;
    use super::*;

    #[test]
    fn test_empty_input() {
        let text = "";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, EOF, 0)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_empty_object() {
        let text = "{}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, RightBrace, 2), (2, EOF, 2)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_double_quoted_string() {
        let text = "{\"foo\":\"bar\"}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, DoubleQuotedString, 6), (6, Colon, 7), (7, DoubleQuotedString, 12), (12, RightBrace, 13), (13, EOF, 13)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_round_trip_matches_default() {
        let text = "{\"foo\":\"bar\"}";
        let toks = tokenize_rt_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, DoubleQuotedString, 6), (6, Colon, 7), (7, DoubleQuotedString, 12), (12, RightBrace, 13), (13, EOF, 13)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_single_quoted_string() {
        let text = "{'foo':'bar'}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, SingleQuotedString, 6), (6, Colon, 7), (7, SingleQuotedString, 12), (12, RightBrace, 13), (13, EOF, 13)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_array() {
        let text = "[1,2,3]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBracket, 1), (1, Integer, 2), (2, Comma, 3), (3, Integer, 4), (4, Comma, 5), (5, Integer, 6), (6, RightBracket, 7), (7, EOF, 7)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_float_number() {
        let text = "[1.23,4.56]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBracket, 1), (1, Float, 5), (5, Comma, 6), (6, Float, 10), (10, RightBracket, 11), (11, EOF, 11)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_exponent_number() {
        let text = "[1e10,2e-5]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBracket, 1), (1, Exponent, 5), (5, Comma, 6), (6, Exponent, 10), (10, RightBracket, 11), (11, EOF, 11)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_whitespace() {
        let text = " {\n\t} ";
        let toks = Tokenizer::with_configuration(text, TokenizerConfig { include_whitespace: true, include_comments: true, allow_octal: false }).tokenize().unwrap();
        let expected = Tokens { tok_spans: vec![(0, Whitespace, 1), (1, LeftBrace, 2), (2, Whitespace, 4), (4, RightBrace, 5), (5, Whitespace, 6), (6, EOF, 6)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_true_false_null() {
        let text = "[true,false,null]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, LeftBracket, 1), (1, True, 5), (5, Comma, 6), (6, False, 11), (11, Comma, 12), (12, Null, 16), (16, RightBracket, 17), (17, EOF, 17)] };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_number() {
        let text = "123";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, Integer, 3), (3, EOF, 3)] };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_unexpected_symbol() {
        let text = "1!2";
        tokenize_str(text).unwrap_err();
    }

    #[test]
    fn test_special_things() {
        let text = r#"{$_:1,_$:2,a\u200C:3}"#;
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, LeftBrace, 1), (1, Name, 3), (3, Colon, 4), (4, Integer, 5), (5, Comma, 6), (6, Name, 8), (8, Colon, 9), (9, Integer, 10), (10, Comma, 11), (11, Name, 18), (18, Colon, 19), (19, Integer, 20), (20, RightBrace, 21), (21, EOF, 21)] };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_eof_after_multibyte() {
        let text = r#"ë"#;
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, Name, 2), (2, EOF, 2)] };
        assert_eq!(toks, expected);
    }
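
    // Added check (a sketch): a block comment's span ends one past the closing
    // '/', matching the exclusive-end convention of the other tokens.
    #[test]
    fn test_block_comment_roundtrip() {
        let text = "/**/1";
        let toks = tokenize_rt_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, BlockComment, 4), (4, Integer, 5), (5, EOF, 5)] };
        assert_eq!(toks, expected);
    }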
}