// json_five/tokenize.rs

use std::fmt::{Display, Formatter};
use std::iter::Peekable;
use std::str::CharIndices;
use crate::utils::get_line_col_char;

#[derive(PartialEq, Clone, Debug)]
pub enum TokType {
    LeftBrace,
    RightBrace,
    LeftBracket,
    RightBracket,
    Comma,
    Colon,
    Name,
    SingleQuotedString,
    DoubleQuotedString,
    BlockComment,
    LineComment,
    Whitespace,
    True,
    False,
    Null,
    Integer,
    Float,
    Infinity,
    Nan,
    Exponent,
    Hexadecimal,
    // Octal,
    Plus,
    Minus,
    EOF,
}

/// Start byte offset, token type, end byte offset (noninclusive).
pub(crate) type TokenSpan = (usize, TokType, usize);
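// For example (taken from the tests below), tokenizing "{}" yields the spans
// (0, LeftBrace, 1), (1, RightBrace, 2), (2, EOF, 2).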
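/// Tokenization output: the token spans together with the source text they index into.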
#[derive(Debug, PartialEq)]
pub struct Tokens<'input> {
    pub tok_spans: Vec<TokenSpan>,
    pub(crate) source: &'input str,
}

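/// Error raised when tokenization fails; positions are given as byte offset,
/// line/column, and char offset.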
#[derive(Debug)]
pub struct TokenizationError {
    pub message: String,
    pub index: usize, // byte offset
    pub lineno: usize,
    pub colno: usize,
    pub char_index: usize, // char offset
}

impl Display for TokenizationError {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(f, "TokenizationError: {}: line {} column {} (char {})", self.message, self.lineno, self.colno, self.char_index)
    }
}

#[derive(Debug)]
pub(crate) struct Tokenizer<'input> {
    configuration: TokenizerConfig,
    text: &'input str,
    chars: Peekable<CharIndices<'input>>,
    lookahead: Option<(usize, char)>, // the most recently consumed (byte offset, char)
}

const HEX_CHARS: &str = "0123456789abcdefABCDEF";
const IDENTIFIER_START_SYMBOLS: &str = "$_";
const IDENTIFIER_PARTS: &str = "$_\u{200C}\u{200D}\u{005F}\u{203F}\u{2040}\u{2054}\u{FE33}\u{FE34}\u{FE4D}\u{FE4E}\u{FE4F}\u{FF3F}";

#[derive(Debug)]
pub struct TokenizerConfig {
    pub include_whitespace: bool,
    pub include_comments: bool,
    pub allow_octal: bool,
}

impl TokenizerConfig {
    pub fn new() -> Self {
        TokenizerConfig { include_whitespace: false, include_comments: false, allow_octal: false }
    }
}
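
// Not in the original file: a Default impl is the idiomatic companion to a
// no-argument new() (clippy's `new_without_default` lint suggests exactly this).
impl Default for TokenizerConfig {
    fn default() -> Self {
        Self::new()
    }
}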

impl<'input> Tokenizer<'input> {
    pub fn new(text: &'input str) -> Self {
        Tokenizer { configuration: TokenizerConfig::new(), text, chars: text.char_indices().peekable(), lookahead: None }
    }

    pub fn with_configuration(text: &'input str, configuration: TokenizerConfig) -> Self {
        Tokenizer { configuration, text, chars: text.char_indices().peekable(), lookahead: None }
    }

    fn advance(&mut self) -> Option<(usize, char)> {
        self.lookahead = self.chars.next();
        self.lookahead
    }

    fn make_error(&self, message: String, start_index: usize) -> TokenizationError {
        let (lineno, colno, char_index) = get_line_col_char(self.text, start_index);
        TokenizationError { message, index: start_index, lineno, colno, char_index }
    }

    fn process_string(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, quote_char) = self.lookahead.expect("Expected quote character");

        let string_type: TokType = match quote_char {
            '"' => TokType::DoubleQuotedString,
            '\'' => TokType::SingleQuotedString,
            _ => unreachable!("Expected quote character, but got {:?}", quote_char)
        };

        let mut last_char = quote_char;

        let mut escaping = false;
        loop {
            match self.advance() {
                None => {
                    break Err(self.make_error("Unterminated string starting at".to_string(), start_idx))
                },
                Some((idx, char)) => {
                    match char {
                        '\\' => {
                            // Toggle rather than set, so an escaped backslash ("\\\\")
                            // does not escape the character that follows it
                            escaping = !escaping;
                            last_char = char;
                            continue
                        }
                        '\n' => {
                            // A bare newline is only legal as an escaped line continuation,
                            // or as the second half of an escaped "\r\n" pair
                            if !escaping && last_char != '\r' {
                                break Err(self.make_error("Unexpected line terminator without continuation in string literal at".to_string(), idx))
                            }
                            escaping = false;
                            last_char = char;
                            continue
                        }
                        '\r' | '\u{2028}' | '\u{2029}' => {
                            if !escaping {
                                break Err(self.make_error("Unexpected line terminator without continuation in string literal at".to_string(), idx))
                            }
                            escaping = false;
                            last_char = char;
                            continue
                        },
                        c if c == quote_char && !escaping => {
                            break Ok((start_idx, string_type, idx + 1))
                        },
                        _ => {
                            escaping = false;
                            last_char = char;
                            continue
                        }
                    }
                }
            }
        }
    }

    fn process_whitespace(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting whitespace char");
        let mut last_index = start_idx;
        let mut last_char = start_char;
        loop {
            match self.chars.peek() {
                None => break Ok((start_idx, TokType::Whitespace, last_index + last_char.len_utf8())),
                Some((peeked_idx, peeked_char)) => {
                    if peeked_char.is_whitespace() {
                        last_index = *peeked_idx;
                        last_char = *peeked_char;
                        self.advance();
                        continue
                    } else {
                        break Ok((start_idx, TokType::Whitespace, last_index + last_char.len_utf8()))
                    }
                }
            }
        }
    }

    fn process_octal(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, _start_char) = self.lookahead.expect("Unexpected end of input, was processing octal");
        if self.configuration.allow_octal {
            todo!()
        } else {
            Err(self.make_error("Octal literals are forbidden".to_string(), start_idx))
        }
    }

    fn process_hexadecimal(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting numeric char");
        let (_, x_char) = self.advance().expect("Expected hex x");
        assert_eq!(start_char, '0');
        if x_char != 'x' && x_char != 'X' {
            unreachable!("Invalid hexadecimal here")
        }

        match self.advance() {
            None => {
                return Err(self.make_error("Expected at least one digit in hexadecimal literal".to_string(), start_idx))
            }
            Some((mut last_idx, first_digit)) => {
                if !HEX_CHARS.contains(first_digit) {
                    return Err(self.make_error(format!("Invalid hexadecimal character {:?} in literal starting at", first_digit), start_idx))
                }
                loop {
                    match self.chars.peek() {
                        // Hex digits are ASCII, so last_idx + 1 is a valid byte offset
                        None => break Ok((start_idx, TokType::Hexadecimal, last_idx + 1)),
                        Some((offset, char)) => {
                            if !HEX_CHARS.contains(*char) {
                                break Ok((start_idx, TokType::Hexadecimal, last_idx + 1))
                            }
                            last_idx = *offset;
                            self.advance();
                            continue
                        }
                    }
                }
            }
        }
    }

    fn process_number(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting numeric char");
        let mut last_index = start_idx;
        let mut decimal_seen: bool = false;
        let mut exponent_seen: bool = false;
        let mut unary_seen: bool = false;
        if start_char == '.' {
            decimal_seen = true
        }

        let maybe_second_char = self.chars.peek();
        match maybe_second_char {
            None => {
                if decimal_seen {
                    return Err(self.make_error("Lone decimal is an invalid literal".to_string(), start_idx))
                }
                return Ok((start_idx, TokType::Integer, start_idx + 1))
            },
            // "0x"/"0X" dispatches to hex; a leading zero followed by a digit is octal
            Some((_second_idx, second_char)) if start_char == '0' => {
                match second_char {
                    'x' | 'X' => {return self.process_hexadecimal()}
                    sc if sc.is_ascii_digit() => {
                        return self.process_octal()
                    },
                    _ => {}
                }
            }
            _ => {}
        }

        loop {
            match self.chars.peek() {
                None => {
                    // End of input: reject a trailing sign or exponent marker, then classify
                    if unary_seen || exponent_seen {
                        let (_, last_char) = self.lookahead.unwrap();
                        if "+-eE".contains(last_char) {
                            return Err(self.make_error(format!("Invalid number literal (missing digit after {:?})", last_char), start_idx))
                        }
                    }
                    if exponent_seen {
                        break Ok((start_idx, TokType::Exponent, last_index + 1))
                    } else if decimal_seen {
                        if start_idx == last_index {
                            return Err(self.make_error("Lone decimal is an invalid number literal".to_string(), start_idx))
                        }
                        break Ok((start_idx, TokType::Float, last_index + 1))
                    } else {
                        break Ok((start_idx, TokType::Integer, last_index + 1))
                    }
                },
                Some((next_idx, next_char)) => {
                    match *next_char {
                        c if c.is_ascii_digit() => {
                            last_index = *next_idx;
                            self.advance();
                            continue
                        },
                        '.' => {
                            if decimal_seen {
                                return Err(self.make_error("Invalid number literal (unexpected decimal)".to_string(), start_idx))
                            }
                            decimal_seen = true;
                            if exponent_seen {
                                return Err(self.make_error("Invalid exponent literal (float exponents forbidden) at".to_string(), start_idx))
                            }
                            last_index = *next_idx;
                            self.advance();
                            continue
                        },
                        'e' | 'E' => {
                            if exponent_seen {
                                return Err(self.make_error("Invalid number literal (only one exponent part is allowed)".to_string(), start_idx))
                            }
                            exponent_seen = true;
                            last_index = *next_idx;
                            self.advance();
                        }
                        '+' | '-' => {
                            let (_, previous_char) = self.lookahead.unwrap();
                            unary_seen = true;
                            match previous_char {
                                'e' | 'E' => {
                                    last_index = *next_idx;
                                    self.advance();
                                }
                                _ => {
                                    return Err(self.make_error("Unary within number literal only allowed after exponent part".to_string(), start_idx))
                                }
                            }
                        }
                        _ => {
                            // The peeked character can't be part of a number.
                            // Verify the number is valid, then classify it.
                            if unary_seen || exponent_seen {
                                let (_, last_char) = self.lookahead.unwrap();
                                if "+-eE".contains(last_char) {
                                    return Err(self.make_error(format!("Invalid number literal (missing digit after {:?})", last_char), start_idx))
                                }
                            }
                            if exponent_seen {
                                break Ok((start_idx, TokType::Exponent, last_index + 1))
                            } else if decimal_seen {
                                if start_idx == last_index {
                                    return Err(self.make_error("Lone decimal is an invalid number literal".to_string(), start_idx))
                                }
                                break Ok((start_idx, TokType::Float, last_index + 1))
                            } else {
                                break Ok((start_idx, TokType::Integer, last_index + 1))
                            }
                        }
                    }
                }
            }
        }
    }

    fn tok_from_indices(&self, start: usize, end: usize) -> Result<TokenSpan, TokenizationError> {
        let lexeme = &self.text[start..end];
        match lexeme {
            "true" => Ok((start, TokType::True, end)),
            "false" => Ok((start, TokType::False, end)),
            "NaN" => Ok((start, TokType::Nan, end)),
            "Infinity" => Ok((start, TokType::Infinity, end)),
            "null" => Ok((start, TokType::Null, end)),
            _ => Ok((start, TokType::Name, end)),
        }
    }

    fn process_identifier_or_const(&mut self) -> Result<TokenSpan, TokenizationError> {
        use crate::utils::read_hex_digits;
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting identifier/const char");
        let mut last_idx = start_idx;
        use unicode_general_category::{get_general_category, GeneralCategory};
        match start_char {
            c if c.is_alphabetic() => {}
            c if IDENTIFIER_START_SYMBOLS.contains(c) => {}
            '\\' => {
                match self.chars.peek() {
                    None => {return Err(self.make_error("Unexpected EOF".to_string(), start_idx))}
                    Some((_, c)) => {
                        match c {
                            'u' => {
                                let mut ubuffer = String::with_capacity(4);
                                self.advance();
                                for _ in 0..4 {
                                    match self.advance() {
                                        None => {
                                            return Err(self.make_error("Invalid identifier start".to_string(), start_idx))
                                        }
                                        Some((idx, c)) => {
                                            ubuffer.push(c);
                                            last_idx = idx;
                                            if !HEX_CHARS.contains(c) {
                                                return Err(self.make_error("Invalid identifier start".to_string(), start_idx))
                                            }
                                        }
                                    }
                                }
                                let maybe_hex_val = read_hex_digits(&mut ubuffer.chars().peekable(), 4, ubuffer.as_str());
                                match maybe_hex_val {
                                    Err(_) => {
                                        return Err(self.make_error(format!("invalid unicode escape: \\u{}", ubuffer), start_idx))
                                    }
                                    Ok(hex_val) => {
                                        let maybe_c = char::from_u32(hex_val);
                                        match maybe_c {
                                            None => {
                                                return Err(self.make_error(format!("invalid unicode escape value: \\u{}", ubuffer), start_idx))
                                            }
                                            Some(c) => {
                                                if !c.is_alphabetic() && !IDENTIFIER_START_SYMBOLS.contains(c) {
                                                    return Err(self.make_error(format!("Illegal identifier start from unicode escape sequence: \\u{}", ubuffer), start_idx))
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                            _ => {
                                return Err(self.make_error("Invalid identifier start".to_string(), start_idx))
                            }
                        }
                    }
                }
            }
            _ => {
                return Err(self.make_error(format!("Invalid character {}", start_char), start_idx))
            }
        }
        let mut last_char = start_char;
        loop {
            match self.chars.peek() {
                None => break self.tok_from_indices(start_idx, last_idx + last_char.len_utf8()),
                Some((next_idx, next_char)) => {
                    if next_char.is_whitespace() {
                        break self.tok_from_indices(start_idx, last_idx + last_char.len_utf8())
                    } else if next_char.is_alphanumeric() {
                        last_idx = *next_idx;
                        last_char = *next_char;
                        self.advance();
                        continue
                    } else if IDENTIFIER_PARTS.contains(*next_char) {
                        last_idx = *next_idx;
                        last_char = *next_char;
                        self.advance();
                        continue
                    } else if *next_char == '\\' {
                        self.advance();
                        match self.advance() {
                            None => {return Err(self.make_error("Unexpected EOF".to_string(), start_idx))}
                            Some((_, c)) => {
                                match c {
                                    'u' => {
                                        for _ in 0..4 {
                                            match self.advance() {
                                                None => {
                                                    return Err(self.make_error("Unexpected end of input in unicode escape in identifier".to_string(), start_idx))
                                                }
                                                Some((_, c)) => {
                                                    if !HEX_CHARS.contains(c) {
                                                        return Err(self.make_error("Invalid hex digit in unicode escape in identifier".to_string(), start_idx))
                                                    }
                                                }
                                            }
                                        }
                                        (last_idx, last_char) = self.lookahead.unwrap()
                                    }
                                    _ => {
                                        return Err(self.make_error("Invalid escape in identifier (only \\u escapes are allowed)".to_string(), start_idx))
                                    }
                                }
                            }
                        }
                    } else {
                        // Combining marks may continue an identifier even though they are
                        // neither alphanumeric nor listed in IDENTIFIER_PARTS
                        match get_general_category(*next_char) {
                            GeneralCategory::NonspacingMark | GeneralCategory::SpacingMark => {
                                last_idx = *next_idx;
                                last_char = *next_char;
                                self.advance();
                                continue
                            }
                            _ => break self.tok_from_indices(start_idx, last_idx + last_char.len_utf8())
                        }
                    }
                }
            }
        }
    }

    fn process_comment(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, _char) = self.lookahead.expect("Expected comment start");
        let (mut last_idx, star_or_slash) = self.advance().expect("Expected second comment char");
        match star_or_slash {
            '/' => {
                // line comment
                loop {
                    match self.chars.peek() {
                        None => {
                            return Ok((start_idx, TokType::LineComment, last_idx + 1))
                        },
                        Some((peeked_idx, peeked_char)) => {
                            match peeked_char {
                                '\n' | '\r' | '\u{2028}' | '\u{2029}' => {
                                    (last_idx, _) = self.advance().unwrap();
                                    return Ok((start_idx, TokType::LineComment, last_idx + 1))
                                }
                                _ => {
                                    last_idx = *peeked_idx;
                                    self.advance();
                                }
                            }
                        }
                    }
                }
            },
            '*' => {
                // block comment
                loop {
                    match self.chars.peek() {
                        None => {
                            return Err(self.make_error("Unexpected end of input while processing block comment".to_string(), start_idx))
                        }
                        Some((_peeked_idx, peeked_char)) => {
                            match peeked_char {
                                '*' => {
                                    self.advance();
                                    let maybe_next_next = self.chars.peek();
                                    match maybe_next_next {
                                        None => {
                                            return Err(self.make_error("Unexpected end of input while processing block comment".to_string(), start_idx))
                                        },
                                        Some((_next_peeked_idx, next_peeked_char)) => {
                                            match next_peeked_char {
                                                '/' => {
                                                    (last_idx, _) = self.advance().unwrap();
                                                    // End offsets are noninclusive, so the span must
                                                    // extend one past the closing '/'
                                                    return Ok((start_idx, TokType::BlockComment, last_idx + 1))
                                                }
                                                _ => {
                                                    continue
                                                }
                                            }
                                        }
                                    }
                                }
                                _ => {
                                    self.advance();
                                    continue
                                }
                            }
                        }
                    }
                }
            }
            _ => unreachable!("Invalid second comment char")
        }
    }

    fn next_token(&mut self) -> Result<TokenSpan, TokenizationError> {
        let maybe_last = self.lookahead;
        let maybe_next = self.advance();
        match maybe_next {
            None => {
                match maybe_last {
                    Some((last_idx, last_char)) => Ok((last_idx + last_char.len_utf8(), TokType::EOF, last_idx + last_char.len_utf8())),
                    None => Ok((0, TokType::EOF, 0)),
                }
            }
            Some((next_idx, next)) => {
                match next {
                    '{' => Ok((next_idx, TokType::LeftBrace, next_idx + 1)),
                    '}' => Ok((next_idx, TokType::RightBrace, next_idx + 1)),
                    '[' => Ok((next_idx, TokType::LeftBracket, next_idx + 1)),
                    ']' => Ok((next_idx, TokType::RightBracket, next_idx + 1)),
                    ',' => Ok((next_idx, TokType::Comma, next_idx + 1)),
                    ':' => Ok((next_idx, TokType::Colon, next_idx + 1)),
                    '+' => Ok((next_idx, TokType::Plus, next_idx + 1)),
                    '-' => Ok((next_idx, TokType::Minus, next_idx + 1)),
                    '\'' | '"' => self.process_string(),
                    '.' => self.process_number(),
                    // U+FEFF (BOM) counts as whitespace in JSON5 but not for char::is_whitespace
                    '\u{FEFF}' => {
                        let whitespace_tok = self.process_whitespace()?;
                        if self.configuration.include_whitespace {
                            Ok(whitespace_tok)
                        } else {
                            self.next_token()
                        }
                    }
                    c if c.is_whitespace() => {
                        let whitespace_tok = self.process_whitespace()?;
                        if self.configuration.include_whitespace {
                            Ok(whitespace_tok)
                        } else {
                            self.next_token()
                        }
                    },
                    c if c.is_ascii_digit() => self.process_number(),
                    '/' => {
                        // The '!' sentinel is any character that cannot continue a comment
                        let (_, next_next) = self.chars.peek().unwrap_or(&(usize::MAX, '!'));
                        match next_next {
                            '/' | '*' => {
                                if self.configuration.include_comments {
                                    self.process_comment()
                                } else {
                                    self.process_comment()?;
                                    self.next_token()
                                }
                            },
                            _ => {
                                Err(self.make_error("unexpected token '/'".to_string(), next_idx))
                            }
                        }
                    }
                    _ => self.process_identifier_or_const()
                }
            }
        }
    }

    pub(crate) fn tokenize(&mut self) -> Result<Tokens<'input>, TokenizationError> {
        let mut tokens: Vec<TokenSpan> = Vec::new();
        loop {
            let tok = self.next_token()?;
            let is_eof = tok.1 == TokType::EOF;
            tokens.push(tok);
            if is_eof {
                break
            }
        }
        Ok(Tokens { tok_spans: tokens, source: self.text })
    }
}

impl<'input> Iterator for Tokenizer<'input> {
    type Item = Result<TokenSpan, TokenizationError>;
    fn next(&mut self) -> Option<Self::Item> {
        match self.next_token() {
            // The EOF token is not yielded; iteration simply ends
            Ok((_, TokType::EOF, _)) => None,
            Ok(span) => Some(Ok(span)),
            Err(e) => Some(Err(e)),
        }
    }
}
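
// A small usage sketch (not from the original file): within the crate, the
// tokenizer can also be driven as an iterator; the EOF token is never yielded.
//
//     for span in Tokenizer::new("[1]") {
//         let (start, tok_type, end) = span.unwrap();
//         // ... use the span ...
//     }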

/// Turn a str into [Tokens].
///
/// Usually not used directly.
/// Token spans will not include whitespace or comment tokens.
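///
/// A usage sketch (assuming this module is exposed as `json_five::tokenize`;
/// the expected spans are taken from the tests below):
///
/// ```ignore
/// use json_five::tokenize::{tokenize_str, TokType};
///
/// let toks = tokenize_str("{}").unwrap();
/// assert_eq!(toks.tok_spans, vec![(0, TokType::LeftBrace, 1), (1, TokType::RightBrace, 2), (2, TokType::EOF, 2)]);
/// ```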
pub fn tokenize_str(text: &'_ str) -> Result<Tokens<'_>, TokenizationError> {
    Tokenizer::new(text).tokenize()
}

/// Like [tokenize_str] but includes whitespace and comment tokens.
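///
/// A usage sketch (same module-path assumption as above; spans taken from the
/// whitespace test below):
///
/// ```ignore
/// use json_five::tokenize::{tokenize_rt_str, TokType};
///
/// let toks = tokenize_rt_str(" {\n\t} ").unwrap();
/// // Whitespace runs are preserved as single tokens
/// assert_eq!(toks.tok_spans[0], (0, TokType::Whitespace, 1));
/// ```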
pub fn tokenize_rt_str(text: &'_ str) -> Result<Tokens<'_>, TokenizationError> {
    let config = TokenizerConfig { include_comments: true, include_whitespace: true, allow_octal: false };
    Tokenizer::with_configuration(text, config).tokenize()
}

/// Like [tokenize_str] but for bytes. The input must be valid UTF-8; otherwise
/// the returned error points at the first invalid byte.
pub fn tokenize_bytes(bytes: &'_ [u8]) -> Result<Tokens<'_>, TokenizationError> {
    let maybe_text = std::str::from_utf8(bytes);
    match maybe_text {
        Ok(text) => {
            Tokenizer::new(text).tokenize()
        }
        Err(e) => {
            let valid_point = e.valid_up_to();
            if valid_point > 0 {
                let valid_text = std::str::from_utf8(&bytes[..valid_point]).unwrap();
                let (lineno, colno, char_index) = get_line_col_char(valid_text, valid_point);
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno, colno, char_index, index: valid_point })
            } else {
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno: 1, colno: 0, char_index: 0, index: 0 })
            }
        }
    }
}

/// Like [tokenize_rt_str] but for bytes. UTF-8 errors are reported as in [tokenize_bytes].
pub fn tokenize_rt_bytes(bytes: &'_ [u8]) -> Result<Tokens<'_>, TokenizationError> {
    let maybe_text = std::str::from_utf8(bytes);
    match maybe_text {
        Ok(text) => {
            let config = TokenizerConfig { include_comments: true, include_whitespace: true, allow_octal: false };
            Tokenizer::with_configuration(text, config).tokenize()
        }
        Err(e) => {
            let valid_point = e.valid_up_to();
            if valid_point > 0 {
                let valid_text = std::str::from_utf8(&bytes[..valid_point]).unwrap();
                let (lineno, colno, char_index) = get_line_col_char(valid_text, valid_point);
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno, colno, char_index, index: valid_point })
            } else {
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno: 1, colno: 0, char_index: 0, index: 0 })
            }
        }
    }
}

#[cfg(test)]
mod test {
    use crate::tokenize::TokType::*;
    use super::*;

    #[test]
    fn test_foo() {
        let text = "";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, EOF, 0)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_heck() {
        let text = "{}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, RightBrace, 2), (2, EOF, 2)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_heck2() {
        let text = "{\"foo\":\"bar\"}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, DoubleQuotedString, 6), (6, Colon, 7), (7, DoubleQuotedString, 12), (12, RightBrace, 13), (13, EOF, 13)], source: text };
        assert_eq!(toks, expected)
    }

    #[test]
    fn test_heck3() {
        let text = "{\"foo\":\"bar\"}";
        let toks = tokenize_rt_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, DoubleQuotedString, 6), (6, Colon, 7), (7, DoubleQuotedString, 12), (12, RightBrace, 13), (13, EOF, 13)], source: text };
        assert_eq!(toks, expected)
    }

    #[test]
    fn test_single_quoted_string() {
        let text = "{'foo':'bar'}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, SingleQuotedString, 6), (6, Colon, 7), (7, SingleQuotedString, 12), (12, RightBrace, 13), (13, EOF, 13)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_array() {
        let text = "[1,2,3]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBracket, 1), (1, Integer, 2), (2, Comma, 3), (3, Integer, 4), (4, Comma, 5), (5, Integer, 6), (6, RightBracket, 7), (7, EOF, 7)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_float_number() {
        let text = "[1.23,4.56]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBracket, 1), (1, Float, 5), (5, Comma, 6), (6, Float, 10), (10, RightBracket, 11), (11, EOF, 11)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_exponent_number() {
        let text = "[1e10,2e-5]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBracket, 1), (1, Exponent, 5), (5, Comma, 6), (6, Exponent, 10), (10, RightBracket, 11), (11, EOF, 11)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_whitespace() {
        let text = " {\n\t} ";
        let toks = Tokenizer::with_configuration(text, TokenizerConfig { include_whitespace: true, include_comments: true, allow_octal: false }).tokenize().unwrap();
        let expected = Tokens { tok_spans: vec![(0, Whitespace, 1), (1, LeftBrace, 2), (2, Whitespace, 4), (4, RightBrace, 5), (5, Whitespace, 6), (6, EOF, 6)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_true_false_null() {
        let text = "[true,false,null]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, LeftBracket, 1), (1, True, 5), (5, Comma, 6), (6, False, 11), (11, Comma, 12), (12, Null, 16), (16, RightBracket, 17), (17, EOF, 17)] };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_number() {
        let text = "123";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, Integer, 3), (3, EOF, 3)] };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_unexpected_symbol() {
        let text = "1!2";
        tokenize_str(text).unwrap_err();
    }

    #[test]
    fn test_special_things() {
        let text = r#"{$_:1,_$:2,a\u200C:3}"#;
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, LeftBrace, 1), (1, Name, 3), (3, Colon, 4), (4, Integer, 5), (5, Comma, 6), (6, Name, 8), (8, Colon, 9), (9, Integer, 10), (10, Comma, 11), (11, Name, 18), (18, Colon, 19), (19, Integer, 20), (20, RightBrace, 21), (21, EOF, 21)] };
        assert_eq!(toks, expected)
    }

    #[test]
    fn test_eof_after_multibyte() {
        let text = r#"ë"#;
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, Name, 2), (2, EOF, 2)] };
        assert_eq!(toks, expected)
    }
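
    // Additional checks, not in the original suite: comment spans via the
    // round-trip tokenizer and a hexadecimal literal. Expected spans follow the
    // documented TokenSpan convention (noninclusive end offsets).
    #[test]
    fn test_comment_spans() {
        let text = "{} // x";
        let toks = tokenize_rt_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, LeftBrace, 1), (1, RightBrace, 2), (2, Whitespace, 3), (3, LineComment, 7), (7, EOF, 7)] };
        assert_eq!(toks, expected);

        let text = "/*a*/1";
        let toks = tokenize_rt_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, BlockComment, 5), (5, Integer, 6), (6, EOF, 6)] };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_hexadecimal() {
        let text = "0x1A";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, Hexadecimal, 4), (4, EOF, 4)] };
        assert_eq!(toks, expected);
    }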
}