use std::fmt::{Display, Formatter};
use std::iter::Peekable;
use std::str::CharIndices;
use crate::utils::get_line_col_char;

/// The kinds of tokens recognized by the tokenizer.
#[derive(PartialEq, Clone, Debug)]
pub enum TokType {
    LeftBrace,
    RightBrace,
    LeftBracket,
    RightBracket,
    Comma,
    Colon,
    Name,
    SingleQuotedString,
    DoubleQuotedString,
    BlockComment,
    LineComment,
    Whitespace,
    True,
    False,
    Null,
    Integer,
    Float,
    Infinity,
    Nan,
    Exponent,
    Hexadecimal,
    Plus,
    Minus,
    EOF,
}

/// A token span: (start byte offset, token type, exclusive end byte offset).
pub(crate) type TokenSpan = (usize, TokType, usize);

#[derive(Debug, PartialEq)]
pub struct Tokens<'input> {
    pub tok_spans: Vec<TokenSpan>,
    pub(crate) source: &'input str,
}

#[derive(Debug)]
pub struct TokenizationError {
    pub message: String,
    pub index: usize,
    pub lineno: usize,
    pub colno: usize,
    pub char_index: usize,
}

impl Display for TokenizationError {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(f, "TokenizationError: {}: line {} column {} (char {})", self.message, self.lineno, self.colno, self.char_index)
    }
}

#[derive(Debug)]
pub(crate) struct Tokenizer<'input> {
    configuration: TokenizerConfig,
    text: &'input str,
    chars: Peekable<CharIndices<'input>>,
    lookahead: Option<(usize, char)>,
}

const HEX_CHARS: &str = "0123456789abcdefABCDEF";
// Symbols (besides alphabetic characters) that may begin an identifier.
const IDENTIFIER_START_SYMBOLS: &str = "$_";
// Join controls and connector punctuation that may continue an identifier.
const IDENTIFIER_PARTS: &str = "$_\u{200C}\u{200D}\u{005F}\u{203F}\u{2040}\u{2054}\u{FE33}\u{FE34}\u{FE4D}\u{FE4E}\u{FE4F}\u{FF3F}";

#[derive(Debug)]
pub struct TokenizerConfig {
    pub include_whitespace: bool,
    pub include_comments: bool,
    pub allow_octal: bool,
}

impl TokenizerConfig {
    pub fn new() -> Self {
        TokenizerConfig { include_whitespace: false, include_comments: false, allow_octal: false }
    }
}

impl<'input> Tokenizer<'input> {
    pub fn new(text: &'input str) -> Self {
        Tokenizer { configuration: TokenizerConfig::new(), text, chars: text.char_indices().peekable(), lookahead: None }
    }

    pub fn with_configuration(text: &'input str, configuration: TokenizerConfig) -> Self {
        Tokenizer { configuration, text, chars: text.char_indices().peekable(), lookahead: None }
    }

    /// Consume the next char, remembering it as the current lookahead.
    fn advance(&mut self) -> Option<(usize, char)> {
        self.lookahead = self.chars.next();
        self.lookahead
    }

    fn make_error(&self, message: String, start_index: usize) -> TokenizationError {
        let (lineno, colno, char_index) = get_line_col_char(self.text, start_index);
        TokenizationError { message, index: start_index, lineno, colno, char_index }
    }

    fn process_string(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, quote_char) = self.lookahead.expect("Expected quote character");

        let string_type: TokType = match quote_char {
            '"' => TokType::DoubleQuotedString,
            '\'' => TokType::SingleQuotedString,
            _ => unreachable!("Expected quote character, but got {:?}", quote_char)
        };

        let mut last_char = quote_char;
        let mut escaping = false;
        loop {
            match self.advance() {
                None => {
                    break Err(self.make_error("Unterminated string starting at".to_string(), start_idx))
                },
                Some((idx, char)) => {
                    match char {
                        // Toggling handles runs of backslashes: "\\\\" leaves escaping false.
                        '\\' => {
                            escaping = !escaping;
                            last_char = char;
                            continue
                        }
                        '\n' => {
                            // A bare newline is only legal as a line continuation:
                            // escaped, or the tail of an escaped \r\n pair.
                            if !escaping && last_char != '\r' {
                                break Err(self.make_error("Unexpected line terminator without continuation in string literal at".to_string(), idx))
                            }
                            escaping = false;
                            last_char = char;
                            continue
                        }
                        '\r' | '\u{2028}' | '\u{2029}' => {
                            if !escaping {
                                break Err(self.make_error("Unexpected line terminator without continuation in string literal at".to_string(), idx))
                            }
                            escaping = false;
                            last_char = char;
                            continue
                        },
                        c if c == quote_char && !escaping => {
                            break Ok((start_idx, string_type, idx + 1))
                        },
                        _ => {
                            escaping = false;
                            last_char = char;
                            continue
                        }
                    }
                }
            }
        }
    }

    fn process_whitespace(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting whitespace char");
        let mut last_index = start_idx;
        let mut last_char = start_char;
        loop {
            match self.chars.peek() {
                None => break Ok((start_idx, TokType::Whitespace, last_index + last_char.len_utf8())),
                Some((peeked_idx, peeked_char)) => {
                    if peeked_char.is_whitespace() {
                        last_index = *peeked_idx;
                        last_char = *peeked_char;
                        self.advance();
                        continue
                    } else {
                        break Ok((start_idx, TokType::Whitespace, last_index + last_char.len_utf8()))
                    }
                }
            }
        }
    }

    fn process_octal(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, _start_char) = self.lookahead.expect("Unexpected end of input, was processing octal");
        if self.configuration.allow_octal {
            todo!()
        } else {
            Err(self.make_error("Octal literals are forbidden".to_string(), start_idx))
        }
    }

    fn process_hexadecimal(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting numeric char");
        let (_, x_char) = self.advance().expect("Expected hex x");
        assert_eq!(start_char, '0');
        if x_char != 'x' && x_char != 'X' {
            unreachable!("process_hexadecimal called without 0x/0X prefix")
        }

        match self.advance() {
            None => {
                return Err(self.make_error("Expected at least one digit in hexadecimal literal".to_string(), start_idx))
            }
            Some((mut last_idx, first_digit)) => {
                if !HEX_CHARS.contains(first_digit) {
                    return Err(self.make_error(format!("Invalid hexadecimal character {:?} in literal starting at", first_digit), start_idx))
                }
                loop {
                    match self.chars.peek() {
                        None => break Ok((start_idx, TokType::Hexadecimal, last_idx + 1)),
                        Some((offset, char)) => {
                            if !HEX_CHARS.contains(*char) {
                                break Ok((start_idx, TokType::Hexadecimal, last_idx + 1))
                            }
                            last_idx = *offset;
                            self.advance();
                            continue
                        }
                    }
                }
            }
        }
    }

    fn process_number(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting numeric char");

        let maybe_second_char = self.chars.peek();
        match maybe_second_char {
            // A single digit is a complete integer; a lone '.' falls through to
            // the loop below, which rejects it as a lone decimal.
            None if start_char.is_ascii_digit() => return Ok((start_idx, TokType::Integer, start_idx + 1)),
            // A leading zero selects the hexadecimal or (forbidden) octal path.
            Some((_second_idx, second_char)) if start_char == '0' => {
                match second_char {
                    'x' | 'X' => return self.process_hexadecimal(),
                    sc if sc.is_ascii_digit() => return self.process_octal(),
                    _ => {}
                }
            }
            _ => {}
        }

        let mut last_index = start_idx;
        let mut decimal_seen: bool = false;
        let mut exponent_seen: bool = false;
        let mut unary_seen: bool = false;
        match start_char {
            '.' => { decimal_seen = true }
            '+' | '-' => { unary_seen = true }
            _ => {}
        }
        loop {
            match self.chars.peek() {
                None => {
                    // End of input: the literal may not end on a sign or exponent marker.
                    if unary_seen || exponent_seen {
                        let (_, last_char) = self.lookahead.unwrap();
                        if "+-eE".contains(last_char) {
                            return Err(self.make_error(format!("Invalid number literal (missing digit after {:?})", last_char), start_idx))
                        }
                    }
                    if exponent_seen {
                        break Ok((start_idx, TokType::Exponent, last_index + 1))
                    } else if decimal_seen {
                        if start_idx == last_index {
                            return Err(self.make_error("Lone decimal is an invalid number literal".to_string(), start_idx))
                        }
                        break Ok((start_idx, TokType::Float, last_index + 1))
                    } else {
                        break Ok((start_idx, TokType::Integer, last_index + 1))
                    }
                },
                Some((next_idx, next_char)) => {
                    match *next_char {
                        c if c.is_ascii_digit() => {
                            last_index = *next_idx;
                            self.advance();
                            continue
                        },
                        '.' => {
                            if decimal_seen {
                                return Err(self.make_error("Invalid number literal (unexpected decimal)".to_string(), start_idx))
                            }
                            decimal_seen = true;
                            if exponent_seen {
                                return Err(self.make_error("Invalid exponent literal (float exponents forbidden) at".to_string(), start_idx))
                            }
                            last_index = *next_idx;
                            self.advance();
                            continue
                        },
                        'e' | 'E' => {
                            if exponent_seen {
                                return Err(self.make_error("Invalid number literal (only one exponent part is allowed)".to_string(), start_idx))
                            }
                            exponent_seen = true;
                            last_index = *next_idx;
                            self.advance();
                        }
                        '+' | '-' => {
                            // A sign is only legal immediately after the exponent marker.
                            let (_, previous_char) = self.lookahead.unwrap();
                            unary_seen = true;
                            match previous_char {
                                'e' | 'E' => {
                                    last_index = *next_idx;
                                    self.advance();
                                }
                                _ => {
                                    return Err(self.make_error("Unary within number literal only allowed after exponent part".to_string(), start_idx))
                                }
                            }
                        }
                        _ => {
                            // Any other character terminates the literal; apply the same
                            // end-of-literal validation as the end-of-input case above.
                            if unary_seen || exponent_seen {
                                let (_, last_char) = self.lookahead.unwrap();
                                if "+-eE".contains(last_char) {
                                    return Err(self.make_error(format!("Invalid number literal (missing digit after {:?})", last_char), start_idx))
                                }
                            }
                            if exponent_seen {
                                break Ok((start_idx, TokType::Exponent, last_index + 1))
                            } else if decimal_seen {
                                if start_idx == last_index {
                                    return Err(self.make_error("Lone decimal is an invalid number literal".to_string(), start_idx))
                                }
                                break Ok((start_idx, TokType::Float, last_index + 1))
                            } else {
                                break Ok((start_idx, TokType::Integer, last_index + 1))
                            }
                        }
                    }
                }
            }
        }
    }

    /// Classify a completed bareword: reserved constants first, otherwise a Name.
    fn tok_from_indices(&self, start: usize, end: usize) -> Result<TokenSpan, TokenizationError> {
        let lexeme = &self.text[start..end];
        match lexeme {
            "true" => Ok((start, TokType::True, end)),
            "false" => Ok((start, TokType::False, end)),
            "NaN" => Ok((start, TokType::Nan, end)),
            "Infinity" => Ok((start, TokType::Infinity, end)),
            "null" => Ok((start, TokType::Null, end)),
            _ => Ok((start, TokType::Name, end)),
        }
    }

    fn process_identifier_or_const(&mut self) -> Result<TokenSpan, TokenizationError> {
        use crate::utils::read_hex_digits;
        use unicode_general_category::{get_general_category, GeneralCategory};
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting identifier/const char");
        let mut last_idx = start_idx;
        // Validate the first character: alphabetic, '$'/'_', or a \uXXXX escape.
        match start_char {
            c if c.is_alphabetic() => {}
            c if IDENTIFIER_START_SYMBOLS.contains(c) => {}
            '\\' => {
                match self.chars.peek() {
                    None => return Err(self.make_error("Unexpected EOF".to_string(), start_idx)),
                    Some((_, c)) => {
                        match c {
                            'u' => {
                                let mut ubuffer = String::with_capacity(4);
                                self.advance();
                                for _ in 0..4 {
                                    match self.advance() {
                                        None => {
                                            return Err(self.make_error("Invalid identifier start".to_string(), start_idx))
                                        }
                                        Some((idx, c)) => {
                                            ubuffer.push(c);
                                            last_idx = idx;
                                            if !HEX_CHARS.contains(c) {
                                                return Err(self.make_error("Invalid identifier start".to_string(), start_idx))
                                            }
                                        }
                                    }
                                }
                                let maybe_hex_val = read_hex_digits(&mut ubuffer.chars().peekable(), 4, ubuffer.as_str());
                                match maybe_hex_val {
                                    Err(_) => {
                                        return Err(self.make_error(format!("invalid unicode escape: \\u{}", ubuffer), start_idx))
                                    }
                                    Ok(hex_val) => {
                                        let maybe_c = char::from_u32(hex_val);
                                        match maybe_c {
                                            None => {
                                                return Err(self.make_error(format!("invalid unicode escape value: \\u{}", ubuffer), start_idx))
                                            }
                                            Some(c) => {
                                                if !c.is_alphabetic() && !IDENTIFIER_START_SYMBOLS.contains(c) {
                                                    return Err(self.make_error(format!("Illegal identifier start from unicode escape sequence: \\u{}", ubuffer), start_idx))
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                            _ => {
                                return Err(self.make_error("Invalid identifier start".to_string(), start_idx))
                            }
                        }
                    }
                }
            }
            _ => {
                return Err(self.make_error(format!("Invalid character {}", start_char), start_idx))
            }
        }
        let mut last_char = start_char;
        // Consume the identifier body: alphanumerics, permitted symbols, marks,
        // and \uXXXX escapes.
        loop {
            match self.chars.peek() {
                None => break self.tok_from_indices(start_idx, last_idx + last_char.len_utf8()),
                Some((next_idx, next_char)) => {
                    if next_char.is_whitespace() {
                        break self.tok_from_indices(start_idx, last_idx + last_char.len_utf8())
                    } else if next_char.is_alphanumeric() {
                        last_idx = *next_idx;
                        last_char = *next_char;
                        self.advance();
                        continue
                    } else if IDENTIFIER_PARTS.contains(*next_char) {
                        last_idx = *next_idx;
                        last_char = *next_char;
                        self.advance();
                        continue
                    } else if *next_char == '\\' {
                        self.advance();
                        match self.advance() {
                            None => return Err(self.make_error("Unexpected EOF".to_string(), start_idx)),
                            Some((_, c)) => {
                                match c {
                                    'u' => {
                                        for _ in 0..4 {
                                            match self.advance() {
                                                None => {
                                                    return Err(self.make_error("Unexpected end of input in unicode escape in unquoted key".to_string(), start_idx))
                                                }
                                                Some((_, c)) => {
                                                    if !HEX_CHARS.contains(c) {
                                                        return Err(self.make_error("Invalid hex digit in unicode escape in unquoted key".to_string(), start_idx))
                                                    }
                                                }
                                            }
                                        }
                                        (last_idx, last_char) = self.lookahead.unwrap()
                                    }
                                    _ => {
                                        return Err(self.make_error("Invalid escape in unquoted key (only \\u is allowed)".to_string(), start_idx))
                                    }
                                }
                            }
                        }
                    } else {
                        // Combining marks may extend an identifier; anything else ends it.
                        match get_general_category(*next_char) {
                            GeneralCategory::NonspacingMark | GeneralCategory::SpacingMark => {
                                last_idx = *next_idx;
                                last_char = *next_char;
                                self.advance();
                                continue
                            }
                            _ => break self.tok_from_indices(start_idx, last_idx + last_char.len_utf8())
                        }
                    }
                }
            }
        }
    }

    fn process_comment(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, _char) = self.lookahead.expect("Expected comment start");
        let (mut last_idx, star_or_slash) = self.advance().expect("Expected second comment char");
        match star_or_slash {
            '/' => {
                // Line comment: runs to (and includes) the next line terminator, or EOF.
                loop {
                    match self.chars.peek() {
                        None => {
                            return Ok((start_idx, TokType::LineComment, last_idx + 1))
                        },
                        Some((peeked_idx, peeked_char)) => {
                            match peeked_char {
                                '\n' | '\r' | '\u{2028}' | '\u{2029}' => {
                                    (last_idx, _) = self.advance().unwrap();
                                    return Ok((start_idx, TokType::LineComment, last_idx + 1))
                                }
                                _ => {
                                    last_idx = *peeked_idx;
                                    self.advance();
                                }
                            }
                        }
                    }
                }
            },
            '*' => {
                // Block comment: runs to the closing "*/"; EOF before that is an error.
                loop {
                    match self.chars.peek() {
                        None => {
                            return Err(self.make_error("Unexpected end of input while processing block comment".to_string(), start_idx))
                        }
                        Some((_peeked_idx, peeked_char)) => {
                            match peeked_char {
                                '*' => {
                                    self.advance();
                                    let maybe_next_next = self.chars.peek();
                                    match maybe_next_next {
                                        None => {
                                            return Err(self.make_error("Unexpected end of input while processing block comment".to_string(), start_idx))
                                        },
                                        Some((_next_peeked_idx, next_peeked_char)) => {
                                            match next_peeked_char {
                                                '/' => {
                                                    (last_idx, _) = self.advance().unwrap();
                                                    // End offsets are exclusive, so step past the closing '/'.
                                                    return Ok((start_idx, TokType::BlockComment, last_idx + 1))
                                                }
                                                _ => {
                                                    continue
                                                }
                                            }
                                        }
                                    }
                                }
                                _ => {
                                    self.advance();
                                    continue
                                }
                            }
                        }
                    }
                }
            }
            _ => unreachable!("Invalid second comment char")
        }
    }

    fn next_token(&mut self) -> Result<TokenSpan, TokenizationError> {
        let maybe_last = self.lookahead;
        let maybe_next = self.advance();
        match maybe_next {
            None => {
                // The EOF token sits just past the last consumed char (or at 0 for empty input).
                match maybe_last {
                    Some((last_idx, last_char)) => Ok((last_idx + last_char.len_utf8(), TokType::EOF, last_idx + last_char.len_utf8())),
                    None => Ok((0, TokType::EOF, 0)),
                }
            }
            Some((next_idx, next)) => {
                match next {
                    '{' => Ok((next_idx, TokType::LeftBrace, next_idx + 1)),
                    '}' => Ok((next_idx, TokType::RightBrace, next_idx + 1)),
                    '[' => Ok((next_idx, TokType::LeftBracket, next_idx + 1)),
                    ']' => Ok((next_idx, TokType::RightBracket, next_idx + 1)),
                    ',' => Ok((next_idx, TokType::Comma, next_idx + 1)),
                    ':' => Ok((next_idx, TokType::Colon, next_idx + 1)),
                    '+' => Ok((next_idx, TokType::Plus, next_idx + 1)),
                    '-' => Ok((next_idx, TokType::Minus, next_idx + 1)),
                    '\'' | '"' => self.process_string(),
                    '.' => self.process_number(),
                    // The BOM is not matched by `char::is_whitespace`, but is treated as whitespace here.
                    '\u{FEFF}' => {
                        let whitespace_tok = self.process_whitespace()?;
                        if self.configuration.include_whitespace {
                            Ok(whitespace_tok)
                        } else {
                            self.next_token()
                        }
                    }
                    c if c.is_whitespace() => {
                        let whitespace_tok = self.process_whitespace()?;
                        if self.configuration.include_whitespace {
                            Ok(whitespace_tok)
                        } else {
                            self.next_token()
                        }
                    },
                    c if c.is_ascii_digit() => self.process_number(),
                    '/' => {
                        // The '!' placeholder never matches '/' or '*', so EOF after '/' falls through to the error.
                        let (_, next_next) = self.chars.peek().unwrap_or(&(usize::MAX, '!'));
                        match next_next {
                            '/' | '*' => {
                                if self.configuration.include_comments {
                                    self.process_comment()
                                } else {
                                    self.process_comment()?;
                                    self.next_token()
                                }
                            },
                            _ => {
                                Err(self.make_error("unexpected token '/'".to_string(), next_idx))
                            }
                        }
                    }
                    _ => self.process_identifier_or_const()
                }
            }
        }
    }

    pub(crate) fn tokenize(&mut self) -> Result<Tokens<'input>, TokenizationError> {
        let mut tokens: Vec<TokenSpan> = Vec::new();
        loop {
            let tok = self.next_token()?;
            let is_eof = tok.1 == TokType::EOF;
            tokens.push(tok);
            if is_eof {
                break
            }
        }
        Ok(Tokens { tok_spans: tokens, source: self.text })
    }
}

impl<'input> Iterator for Tokenizer<'input> {
    type Item = Result<TokenSpan, TokenizationError>;
    fn next(&mut self) -> Option<Self::Item> {
        match self.next_token() {
            Ok((_, TokType::EOF, _)) => None,
            Ok(span) => Some(Ok(span)),
            Err(e) => Some(Err(e)),
        }
    }
}

/// Tokenize JSON5 text, skipping whitespace and comment tokens.
pub fn tokenize_str(text: &'_ str) -> Result<Tokens<'_>, TokenizationError> {
    Tokenizer::new(text).tokenize()
}

/// Tokenize JSON5 text for round-tripping: whitespace and comments are kept.
pub fn tokenize_rt_str(text: &'_ str) -> Result<Tokens<'_>, TokenizationError> {
    let config = TokenizerConfig { include_comments: true, include_whitespace: true, allow_octal: false };
    Tokenizer::with_configuration(text, config).tokenize()
}

pub fn tokenize_bytes(bytes: &'_ [u8]) -> Result<Tokens<'_>, TokenizationError> {
    let maybe_text = std::str::from_utf8(bytes);
    match maybe_text {
        Ok(text) => {
            Tokenizer::new(text).tokenize()
        }
        Err(e) => {
            // Report the position at which the bytes stop being valid UTF-8.
            let valid_point = e.valid_up_to();
            if valid_point > 0 {
                let valid_text = std::str::from_utf8(&bytes[..valid_point]).unwrap();
                let (lineno, colno, char_index) = get_line_col_char(valid_text, valid_point);
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno, colno, char_index, index: valid_point })
            } else {
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno: 1, colno: 0, char_index: 0, index: 0 })
            }
        }
    }
}

pub fn tokenize_rt_bytes(bytes: &'_ [u8]) -> Result<Tokens<'_>, TokenizationError> {
    let maybe_text = std::str::from_utf8(bytes);
    match maybe_text {
        Ok(text) => {
            let config = TokenizerConfig { include_comments: true, include_whitespace: true, allow_octal: false };
            Tokenizer::with_configuration(text, config).tokenize()
        }
        Err(e) => {
            let valid_point = e.valid_up_to();
            if valid_point > 0 {
                let valid_text = std::str::from_utf8(&bytes[..valid_point]).unwrap();
                let (lineno, colno, char_index) = get_line_col_char(valid_text, valid_point);
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno, colno, char_index, index: valid_point })
            } else {
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno: 1, colno: 0, char_index: 0, index: 0 })
            }
        }
    }
}

#[cfg(test)]
mod test {
    use crate::tokenize::TokType::*;
    use super::*;

    #[test]
    fn test_empty_input() {
        let text = "";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, EOF, 0)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_empty_object() {
        let text = "{}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, RightBrace, 2), (2, EOF, 2)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_object() {
        let text = "{\"foo\":\"bar\"}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, DoubleQuotedString, 6), (6, Colon, 7), (7, DoubleQuotedString, 12), (12, RightBrace, 13), (13, EOF, 13)], source: text };
        assert_eq!(toks, expected)
    }

    #[test]
    fn test_object_rt() {
        let text = "{\"foo\":\"bar\"}";
        let toks = tokenize_rt_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, DoubleQuotedString, 6), (6, Colon, 7), (7, DoubleQuotedString, 12), (12, RightBrace, 13), (13, EOF, 13)], source: text };
        assert_eq!(toks, expected)
    }

    #[test]
    fn test_single_quoted_string() {
        let text = "{'foo':'bar'}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, SingleQuotedString, 6), (6, Colon, 7), (7, SingleQuotedString, 12), (12, RightBrace, 13), (13, EOF, 13)], source: text };
        assert_eq!(toks, expected);
    }
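
    // Example: a string that reaches EOF before its closing quote is an error
    // (the unterminated-string path in `process_string`).
    #[test]
    fn test_unterminated_string() {
        tokenize_str("'abc").unwrap_err();
    }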

    #[test]
    fn test_array() {
        let text = "[1,2,3]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBracket, 1), (1, Integer, 2), (2, Comma, 3), (3, Integer, 4), (4, Comma, 5), (5, Integer, 6), (6, RightBracket, 7), (7, EOF, 7)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_float_number() {
        let text = "[1.23,4.56]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBracket, 1), (1, Float, 5), (5, Comma, 6), (6, Float, 10), (10, RightBracket, 11), (11, EOF, 11)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_exponent_number() {
        let text = "[1e10,2e-5]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBracket, 1), (1, Exponent, 5), (5, Comma, 6), (6, Exponent, 10), (10, RightBracket, 11), (11, EOF, 11)], source: text };
        assert_eq!(toks, expected);
    }
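
    // Examples for the numeric edge paths; the expected spans follow from
    // `process_hexadecimal` and `process_number` as written above.
    #[test]
    fn test_hexadecimal() {
        let text = "0x1A";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, Hexadecimal, 4), (4, EOF, 4)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_trailing_exponent_is_error() {
        // A number literal may not end on its exponent marker.
        tokenize_str("1e").unwrap_err();
    }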

    #[test]
    fn test_whitespace() {
        let text = " {\n\t} ";
        let toks = Tokenizer::with_configuration(text, TokenizerConfig { include_whitespace: true, include_comments: true, allow_octal: false }).tokenize().unwrap();
        let expected = Tokens { tok_spans: vec![(0, Whitespace, 1), (1, LeftBrace, 2), (2, Whitespace, 4), (4, RightBrace, 5), (5, Whitespace, 6), (6, EOF, 6)], source: text };
        assert_eq!(toks, expected);
    }
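
    // Comment tokens under the round-trip config; expected spans follow from
    // `process_comment` as written above. Note that a line comment's span
    // includes its terminating newline; a block comment ends just past "*/".
    #[test]
    fn test_line_comment_rt() {
        let text = "// hi\n1";
        let toks = tokenize_rt_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LineComment, 6), (6, Integer, 7), (7, EOF, 7)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_block_comment_rt() {
        let text = "/*x*/";
        let toks = tokenize_rt_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, BlockComment, 5), (5, EOF, 5)], source: text };
        assert_eq!(toks, expected);
    }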

    #[test]
    fn test_true_false_null() {
        let text = "[true,false,null]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, LeftBracket, 1), (1, True, 5), (5, Comma, 6), (6, False, 11), (11, Comma, 12), (12, Null, 16), (16, RightBracket, 17), (17, EOF, 17)] };
        assert_eq!(toks, expected);
    }
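
    // `Infinity` and `NaN` are classified by `tok_from_indices`, like the
    // constants in the previous test.
    #[test]
    fn test_infinity_nan() {
        let text = "[Infinity,NaN]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, LeftBracket, 1), (1, Infinity, 9), (9, Comma, 10), (10, Nan, 13), (13, RightBracket, 14), (14, EOF, 14)] };
        assert_eq!(toks, expected);
    }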

    #[test]
    fn test_number() {
        let text = "123";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, Integer, 3), (3, EOF, 3)] };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_unexpected_symbol() {
        let text = "1!2";
        tokenize_str(text).unwrap_err();
    }
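
    // The `Iterator` impl yields token spans but stops before the EOF token.
    #[test]
    fn test_iterator_stops_before_eof() {
        let spans: Vec<TokenSpan> = Tokenizer::new("{}").map(|r| r.unwrap()).collect();
        assert_eq!(spans, vec![(0, LeftBrace, 1), (1, RightBrace, 2)]);
    }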

    #[test]
    fn test_special_things() {
        let text = r#"{$_:1,_$:2,a\u200C:3}"#;
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, LeftBrace, 1), (1, Name, 3), (3, Colon, 4), (4, Integer, 5), (5, Comma, 6), (6, Name, 8), (8, Colon, 9), (9, Integer, 10), (10, Comma, 11), (11, Name, 18), (18, Colon, 19), (19, Integer, 20), (20, RightBrace, 21), (21, EOF, 21)] };
        assert_eq!(toks, expected)
    }

    #[test]
    fn test_eof_after_multibyte() {
        let text = r#"ë"#;
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, Name, 2), (2, EOF, 2)] };
        assert_eq!(toks, expected)
    }
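
    // Additional edge cases: a \uXXXX escape as an identifier start, BOM
    // skipping, and the invalid-UTF-8 path of `tokenize_bytes`. Expected spans
    // mirror the implementation above (the BOM is 3 bytes, so `{` starts at 3).
    #[test]
    fn test_unicode_escape_name() {
        let text = r"\u0041";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, Name, 6), (6, EOF, 6)] };
        assert_eq!(toks, expected)
    }

    #[test]
    fn test_bom_is_skipped() {
        let text = "\u{FEFF}{}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(3, LeftBrace, 4), (4, RightBrace, 5), (5, EOF, 5)] };
        assert_eq!(toks, expected)
    }

    #[test]
    fn test_invalid_utf8_bytes() {
        tokenize_bytes(b"\xFF").unwrap_err();
    }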
}