use std::fmt::{Display, Formatter};
use std::iter::Peekable;
use std::str::CharIndices;
use crate::utils::get_line_col_char;

/// The kinds of tokens recognized by the tokenizer.
#[derive(PartialEq, Clone, Debug)]
pub enum TokType {
    LeftBrace,
    RightBrace,
    LeftBracket,
    RightBracket,
    Comma,
    Colon,
    Name,
    SingleQuotedString,
    DoubleQuotedString,
    BlockComment,
    LineComment,
    Whitespace,
    True,
    False,
    Null,
    Integer,
    Float,
    Infinity,
    Nan,
    Exponent,
    Hexadecimal,
    Plus,
    Minus,
    EOF,
}

/// A token span: (start byte offset, token type, exclusive end byte offset).
pub(crate) type TokenSpan = (usize, TokType, usize);

#[derive(Debug, PartialEq)]
pub struct Tokens<'input> {
    pub tok_spans: Vec<TokenSpan>,
    pub(crate) source: &'input str,
}

#[derive(Debug)]
pub struct TokenizationError {
    pub message: String,
    pub index: usize,
    pub lineno: usize,
    pub colno: usize,
    pub char_index: usize,
}

impl Display for TokenizationError {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(f, "TokenizationError: {}: line {} column {} (char {})", self.message, self.lineno, self.colno, self.char_index)
    }
}

#[derive(Debug)]
pub(crate) struct Tokenizer<'input> {
    configuration: TokenizerConfig,
    text: &'input str,
    chars: Peekable<CharIndices<'input>>,
    lookahead: Option<(usize, char)>,
}

const HEX_CHARS: &str = "0123456789abcdefABCDEF";
// Symbols (besides alphabetic characters) that may begin an identifier.
const IDENTIFIER_START_SYMBOLS: &str = "$_";
// Join controls and connector punctuation that may continue an identifier.
const IDENTIFIER_PARTS: &str = "$_\u{200C}\u{200D}\u{005F}\u{203F}\u{2040}\u{2054}\u{FE33}\u{FE34}\u{FE4D}\u{FE4E}\u{FE4F}\u{FF3F}";

#[derive(Debug)]
pub struct TokenizerConfig {
    pub include_whitespace: bool,
    pub include_comments: bool,
    pub allow_octal: bool,
}

impl TokenizerConfig {
    pub fn new() -> Self {
        TokenizerConfig { include_whitespace: false, include_comments: false, allow_octal: false }
    }
}

impl<'input> Tokenizer<'input> {
    pub fn new(text: &'input str) -> Self {
        Tokenizer { configuration: TokenizerConfig::new(), text, chars: text.char_indices().peekable(), lookahead: None }
    }

    pub fn with_configuration(text: &'input str, configuration: TokenizerConfig) -> Self {
        Tokenizer { configuration, text, chars: text.char_indices().peekable(), lookahead: None }
    }

    /// Consume the next char, remembering it as the current lookahead.
    fn advance(&mut self) -> Option<(usize, char)> {
        self.lookahead = self.chars.next();
        self.lookahead
    }

    fn make_error(&self, message: String, start_index: usize) -> TokenizationError {
        let (lineno, colno, char_index) = get_line_col_char(self.text, start_index);
        TokenizationError { message, index: start_index, lineno, colno, char_index }
    }

    fn process_string(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, quote_char) = self.lookahead.expect("Expected quote character");

        let string_type: TokType = match quote_char {
            '"' => TokType::DoubleQuotedString,
            '\'' => TokType::SingleQuotedString,
            _ => unreachable!("Expected quote character, but got {:?}", quote_char)
        };

        let mut last_char = quote_char;
        let mut escaping = false;
        loop {
            match self.advance() {
                None => {
                    break Err(self.make_error("Unterminated string starting at".to_string(), start_idx))
                },
                Some((idx, char)) => {
                    match char {
                        // Toggling handles runs of backslashes: "\\\\" leaves escaping false.
                        '\\' => {
                            escaping = !escaping;
                            last_char = char;
                            continue
                        }
                        '\n' => {
                            // A bare newline is only legal as a line continuation:
                            // escaped, or the tail of an escaped \r\n pair.
                            if !escaping && last_char != '\r' {
                                break Err(self.make_error("Unexpected line terminator without continuation in string literal at".to_string(), idx))
                            }
                            escaping = false;
                            last_char = char;
                            continue
                        }
                        '\r' | '\u{2028}' | '\u{2029}' => {
                            if !escaping {
                                break Err(self.make_error("Unexpected line terminator without continuation in string literal at".to_string(), idx))
                            }
                            escaping = false;
                            last_char = char;
                            continue
                        },
                        c if c == quote_char && !escaping => {
                            break Ok((start_idx, string_type, idx + 1))
                        },
                        _ => {
                            escaping = false;
                            last_char = char;
                            continue
                        }
                    }
                }
            }
        }
    }

    fn process_whitespace(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting whitespace char");
        let mut last_index = start_idx;
        let mut last_char = start_char;
        loop {
            match self.chars.peek() {
                None => break Ok((start_idx, TokType::Whitespace, last_index + last_char.len_utf8())),
                Some((peeked_idx, peeked_char)) => {
                    if peeked_char.is_whitespace() {
                        last_index = *peeked_idx;
                        last_char = *peeked_char;
                        self.advance();
                        continue
                    } else {
                        break Ok((start_idx, TokType::Whitespace, last_index + last_char.len_utf8()))
                    }
                }
            }
        }
    }

    fn process_octal(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, _start_char) = self.lookahead.expect("Unexpected end of input, was processing octal");
        if self.configuration.allow_octal {
            todo!()
        } else {
            Err(self.make_error("Octal literals are forbidden".to_string(), start_idx))
        }
    }

    fn process_hexadecimal(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting numeric char");
        let (_, x_char) = self.advance().expect("Expected hex x");
        assert_eq!(start_char, '0');
        if x_char != 'x' && x_char != 'X' {
            unreachable!("process_hexadecimal called without 0x/0X prefix")
        }

        match self.advance() {
            None => {
                return Err(self.make_error("Expected at least one digit in hexadecimal literal".to_string(), start_idx))
            }
            Some((mut last_idx, first_digit)) => {
                if !HEX_CHARS.contains(first_digit) {
                    return Err(self.make_error(format!("Invalid hexadecimal character {:?} in literal starting at", first_digit), start_idx))
                }
                loop {
                    match self.chars.peek() {
                        None => break Ok((start_idx, TokType::Hexadecimal, last_idx + 1)),
                        Some((offset, char)) => {
                            if !HEX_CHARS.contains(*char) {
                                break Ok((start_idx, TokType::Hexadecimal, last_idx + 1))
                            }
                            last_idx = *offset;
                            self.advance();
                            continue
                        }
                    }
                }
            }
        }
    }

    fn process_number(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting numeric char");

        let maybe_second_char = self.chars.peek();
        match maybe_second_char {
            // A single digit is a complete integer; a lone '.' falls through to
            // the loop below, which rejects it as a lone decimal.
            None if start_char.is_ascii_digit() => return Ok((start_idx, TokType::Integer, start_idx + 1)),
            // A leading zero selects the hexadecimal or (forbidden) octal path.
            Some((_second_idx, second_char)) if start_char == '0' => {
                match second_char {
                    'x' | 'X' => return self.process_hexadecimal(),
                    sc if sc.is_ascii_digit() => return self.process_octal(),
                    _ => {}
                }
            }
            _ => {}
        }

        let mut last_index = start_idx;
        let mut decimal_seen: bool = false;
        let mut exponent_seen: bool = false;
        let mut unary_seen: bool = false;
        match start_char {
            '.' => { decimal_seen = true }
            '+' | '-' => { unary_seen = true }
            _ => {}
        }
        loop {
            match self.chars.peek() {
                None => {
                    // End of input: the literal may not end on a sign or exponent marker.
                    if unary_seen || exponent_seen {
                        let (_, last_char) = self.lookahead.unwrap();
                        if "+-eE".contains(last_char) {
                            return Err(self.make_error(format!("Invalid number literal (missing digit after {:?})", last_char), start_idx))
                        }
                    }
                    if exponent_seen {
                        break Ok((start_idx, TokType::Exponent, last_index + 1))
                    } else if decimal_seen {
                        if start_idx == last_index {
                            return Err(self.make_error("Lone decimal is an invalid number literal".to_string(), start_idx))
                        }
                        break Ok((start_idx, TokType::Float, last_index + 1))
                    } else {
                        break Ok((start_idx, TokType::Integer, last_index + 1))
                    }
                },
                Some((next_idx, next_char)) => {
                    match *next_char {
                        c if c.is_ascii_digit() => {
                            last_index = *next_idx;
                            self.advance();
                            continue
                        },
                        '.' => {
                            if decimal_seen {
                                return Err(self.make_error("Invalid number literal (unexpected decimal)".to_string(), start_idx))
                            }
                            decimal_seen = true;
                            if exponent_seen {
                                return Err(self.make_error("Invalid exponent literal (float exponents forbidden) at".to_string(), start_idx))
                            }
                            last_index = *next_idx;
                            self.advance();
                            continue
                        },
                        'e' | 'E' => {
                            if exponent_seen {
                                return Err(self.make_error("Invalid number literal (only one exponent part is allowed)".to_string(), start_idx))
                            }
                            exponent_seen = true;
                            last_index = *next_idx;
                            self.advance();
                        }
                        '+' | '-' => {
                            // A sign is only legal immediately after the exponent marker.
                            let (_, previous_char) = self.lookahead.unwrap();
                            unary_seen = true;
                            match previous_char {
                                'e' | 'E' => {
                                    last_index = *next_idx;
                                    self.advance();
                                }
                                _ => {
                                    return Err(self.make_error("Unary within number literal only allowed after exponent part".to_string(), start_idx))
                                }
                            }
                        }
                        _ => {
                            // Any other character terminates the literal; apply the same
                            // end-of-literal validation as the end-of-input case above.
                            if unary_seen || exponent_seen {
                                let (_, last_char) = self.lookahead.unwrap();
                                if "+-eE".contains(last_char) {
                                    return Err(self.make_error(format!("Invalid number literal (missing digit after {:?})", last_char), start_idx))
                                }
                            }
                            if exponent_seen {
                                break Ok((start_idx, TokType::Exponent, last_index + 1))
                            } else if decimal_seen {
                                if start_idx == last_index {
                                    return Err(self.make_error("Lone decimal is an invalid number literal".to_string(), start_idx))
                                }
                                break Ok((start_idx, TokType::Float, last_index + 1))
                            } else {
                                break Ok((start_idx, TokType::Integer, last_index + 1))
                            }
                        }
                    }
                }
            }
        }
    }

    /// Classify a completed bareword: reserved constants first, otherwise a Name.
    fn tok_from_indices(&self, start: usize, end: usize) -> Result<TokenSpan, TokenizationError> {
        let lexeme = &self.text[start..end];
        match lexeme {
            "true" => Ok((start, TokType::True, end)),
            "false" => Ok((start, TokType::False, end)),
            "NaN" => Ok((start, TokType::Nan, end)),
            "Infinity" => Ok((start, TokType::Infinity, end)),
            "null" => Ok((start, TokType::Null, end)),
            _ => Ok((start, TokType::Name, end)),
        }
    }

    fn process_identifier_or_const(&mut self) -> Result<TokenSpan, TokenizationError> {
        use crate::utils::read_hex_digits;
        use unicode_general_category::{get_general_category, GeneralCategory};
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting identifier/const char");
        let mut last_idx = start_idx;
        // Validate the first character: alphabetic, '$'/'_', or a \uXXXX escape.
        match start_char {
            c if c.is_alphabetic() => {}
            c if IDENTIFIER_START_SYMBOLS.contains(c) => {}
            '\\' => {
                match self.chars.peek() {
                    None => return Err(self.make_error("Unexpected EOF".to_string(), start_idx)),
                    Some((_, c)) => {
                        match c {
                            'u' => {
                                let mut ubuffer = String::with_capacity(4);
                                self.advance();
                                for _ in 0..4 {
                                    match self.advance() {
                                        None => {
                                            return Err(self.make_error("Invalid identifier start".to_string(), start_idx))
                                        }
                                        Some((idx, c)) => {
                                            ubuffer.push(c);
                                            last_idx = idx;
                                            if !HEX_CHARS.contains(c) {
                                                return Err(self.make_error("Invalid identifier start".to_string(), start_idx))
                                            }
                                        }
                                    }
                                }
                                let maybe_hex_val = read_hex_digits(&mut ubuffer.chars().peekable(), 4, ubuffer.as_str());
                                match maybe_hex_val {
                                    Err(_) => {
                                        return Err(self.make_error(format!("invalid unicode escape: \\u{}", ubuffer), start_idx))
                                    }
                                    Ok(hex_val) => {
                                        let maybe_c = char::from_u32(hex_val);
                                        match maybe_c {
                                            None => {
                                                return Err(self.make_error(format!("invalid unicode escape value: \\u{}", ubuffer), start_idx))
                                            }
                                            Some(c) => {
                                                if !c.is_alphabetic() && !IDENTIFIER_START_SYMBOLS.contains(c) {
                                                    return Err(self.make_error(format!("Illegal identifier start from unicode escape sequence: \\u{}", ubuffer), start_idx))
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                            _ => {
                                return Err(self.make_error("Invalid identifier start".to_string(), start_idx))
                            }
                        }
                    }
                }
            }
            _ => {
                return Err(self.make_error(format!("Invalid character {}", start_char), start_idx))
            }
        }
        let mut last_char = start_char;
        // Consume the identifier body: alphanumerics, permitted symbols, marks,
        // and \uXXXX escapes.
        loop {
            match self.chars.peek() {
                None => break self.tok_from_indices(start_idx, last_idx + last_char.len_utf8()),
                Some((next_idx, next_char)) => {
                    if next_char.is_whitespace() {
                        break self.tok_from_indices(start_idx, last_idx + last_char.len_utf8())
                    } else if next_char.is_alphanumeric() {
                        last_idx = *next_idx;
                        last_char = *next_char;
                        self.advance();
                        continue
                    } else if IDENTIFIER_PARTS.contains(*next_char) {
                        last_idx = *next_idx;
                        last_char = *next_char;
                        self.advance();
                        continue
                    } else if *next_char == '\\' {
                        self.advance();
                        match self.advance() {
                            None => return Err(self.make_error("Unexpected EOF".to_string(), start_idx)),
                            Some((_, c)) => {
                                match c {
                                    'u' => {
                                        for _ in 0..4 {
                                            match self.advance() {
                                                None => {
                                                    return Err(self.make_error("Unexpected end of input in unicode escape in unquoted key".to_string(), start_idx))
                                                }
                                                Some((_, c)) => {
                                                    if !HEX_CHARS.contains(c) {
                                                        return Err(self.make_error("Invalid hex digit in unicode escape in unquoted key".to_string(), start_idx))
                                                    }
                                                }
                                            }
                                        }
                                        (last_idx, last_char) = self.lookahead.unwrap()
                                    }
                                    _ => {
                                        return Err(self.make_error("Invalid escape in unquoted key (only \\u is allowed)".to_string(), start_idx))
                                    }
                                }
                            }
                        }
                    } else {
                        // Combining marks may extend an identifier; anything else ends it.
                        match get_general_category(*next_char) {
                            GeneralCategory::NonspacingMark | GeneralCategory::SpacingMark => {
                                last_idx = *next_idx;
                                last_char = *next_char;
                                self.advance();
                                continue
                            }
                            _ => break self.tok_from_indices(start_idx, last_idx + last_char.len_utf8())
                        }
                    }
                }
            }
        }
    }

    fn process_comment(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, _char) = self.lookahead.expect("Expected comment start");
        let (mut last_idx, star_or_slash) = self.advance().expect("Expected second comment char");
        match star_or_slash {
            '/' => {
                // Line comment: runs to (and includes) the next line terminator, or EOF.
                loop {
                    match self.chars.peek() {
                        None => {
                            return Ok((start_idx, TokType::LineComment, last_idx + 1))
                        },
                        Some((peeked_idx, peeked_char)) => {
                            match peeked_char {
                                '\n' | '\r' | '\u{2028}' | '\u{2029}' => {
                                    (last_idx, _) = self.advance().unwrap();
                                    return Ok((start_idx, TokType::LineComment, last_idx + 1))
                                }
                                _ => {
                                    last_idx = *peeked_idx;
                                    self.advance();
                                }
                            }
                        }
                    }
                }
            },
            '*' => {
                // Block comment: runs to the closing "*/"; EOF before that is an error.
                loop {
                    match self.chars.peek() {
                        None => {
                            return Err(self.make_error("Unexpected end of input while processing block comment".to_string(), start_idx))
                        }
                        Some((_peeked_idx, peeked_char)) => {
                            match peeked_char {
                                '*' => {
                                    self.advance();
                                    let maybe_next_next = self.chars.peek();
                                    match maybe_next_next {
                                        None => {
                                            return Err(self.make_error("Unexpected end of input while processing block comment".to_string(), start_idx))
                                        },
                                        Some((_next_peeked_idx, next_peeked_char)) => {
                                            match next_peeked_char {
                                                '/' => {
                                                    (last_idx, _) = self.advance().unwrap();
                                                    // End offsets are exclusive, so step past the closing '/'.
                                                    return Ok((start_idx, TokType::BlockComment, last_idx + 1))
                                                }
                                                _ => {
                                                    continue
                                                }
                                            }
                                        }
                                    }
                                }
                                _ => {
                                    self.advance();
                                    continue
                                }
                            }
                        }
                    }
                }
            }
            _ => unreachable!("Invalid second comment char")
        }
    }

    fn next_token(&mut self) -> Result<TokenSpan, TokenizationError> {
        let maybe_last = self.lookahead;
        let maybe_next = self.advance();
        match maybe_next {
            None => {
                // The EOF token sits just past the last consumed char (or at 0 for empty input).
                match maybe_last {
                    Some((last_idx, last_char)) => Ok((last_idx + last_char.len_utf8(), TokType::EOF, last_idx + last_char.len_utf8())),
                    None => Ok((0, TokType::EOF, 0)),
                }
            }
            Some((next_idx, next)) => {
                match next {
                    '{' => Ok((next_idx, TokType::LeftBrace, next_idx + 1)),
                    '}' => Ok((next_idx, TokType::RightBrace, next_idx + 1)),
                    '[' => Ok((next_idx, TokType::LeftBracket, next_idx + 1)),
                    ']' => Ok((next_idx, TokType::RightBracket, next_idx + 1)),
                    ',' => Ok((next_idx, TokType::Comma, next_idx + 1)),
                    ':' => Ok((next_idx, TokType::Colon, next_idx + 1)),
                    '+' => Ok((next_idx, TokType::Plus, next_idx + 1)),
                    '-' => Ok((next_idx, TokType::Minus, next_idx + 1)),
                    '\'' | '"' => self.process_string(),
                    '.' => self.process_number(),
                    // The BOM is not matched by `char::is_whitespace`, but is treated as whitespace here.
                    '\u{FEFF}' => {
                        let whitespace_tok = self.process_whitespace()?;
                        if self.configuration.include_whitespace {
                            Ok(whitespace_tok)
                        } else {
                            self.next_token()
                        }
                    }
                    c if c.is_whitespace() => {
                        let whitespace_tok = self.process_whitespace()?;
                        if self.configuration.include_whitespace {
                            Ok(whitespace_tok)
                        } else {
                            self.next_token()
                        }
                    },
                    c if c.is_ascii_digit() => self.process_number(),
                    '/' => {
                        // The '!' placeholder never matches '/' or '*', so EOF after '/' falls through to the error.
                        let (_, next_next) = self.chars.peek().unwrap_or(&(usize::MAX, '!'));
                        match next_next {
                            '/' | '*' => {
                                if self.configuration.include_comments {
                                    self.process_comment()
                                } else {
                                    self.process_comment()?;
                                    self.next_token()
                                }
                            },
                            _ => {
                                Err(self.make_error("unexpected token '/'".to_string(), next_idx))
                            }
                        }
                    }
                    _ => self.process_identifier_or_const()
                }
            }
        }
    }

    pub(crate) fn tokenize(&mut self) -> Result<Tokens<'input>, TokenizationError> {
        let mut tokens: Vec<TokenSpan> = Vec::new();
        loop {
            let tok = self.next_token()?;
            let is_eof = tok.1 == TokType::EOF;
            tokens.push(tok);
            if is_eof {
                break
            }
        }
        Ok(Tokens { tok_spans: tokens, source: self.text })
    }
}

impl<'input> Iterator for Tokenizer<'input> {
    type Item = Result<TokenSpan, TokenizationError>;
    fn next(&mut self) -> Option<Self::Item> {
        match self.next_token() {
            Ok((_, TokType::EOF, _)) => None,
            Ok(span) => Some(Ok(span)),
            Err(e) => Some(Err(e)),
        }
    }
}

/// Tokenize JSON5 text, skipping whitespace and comment tokens.
pub fn tokenize_str(text: &'_ str) -> Result<Tokens<'_>, TokenizationError> {
    Tokenizer::new(text).tokenize()
}

/// Tokenize JSON5 text for round-tripping: whitespace and comments are kept.
pub fn tokenize_rt_str(text: &'_ str) -> Result<Tokens<'_>, TokenizationError> {
    let config = TokenizerConfig { include_comments: true, include_whitespace: true, allow_octal: false };
    Tokenizer::with_configuration(text, config).tokenize()
}

pub fn tokenize_bytes(bytes: &'_ [u8]) -> Result<Tokens<'_>, TokenizationError> {
    let maybe_text = std::str::from_utf8(bytes);
    match maybe_text {
        Ok(text) => {
            Tokenizer::new(text).tokenize()
        }
        Err(e) => {
            // Report the position at which the bytes stop being valid UTF-8.
            let valid_point = e.valid_up_to();
            if valid_point > 0 {
                let valid_text = std::str::from_utf8(&bytes[..valid_point]).unwrap();
                let (lineno, colno, char_index) = get_line_col_char(valid_text, valid_point);
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno, colno, char_index, index: valid_point })
            } else {
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno: 1, colno: 0, char_index: 0, index: 0 })
            }
        }
    }
}

pub fn tokenize_rt_bytes(bytes: &'_ [u8]) -> Result<Tokens<'_>, TokenizationError> {
    let maybe_text = std::str::from_utf8(bytes);
    match maybe_text {
        Ok(text) => {
            let config = TokenizerConfig { include_comments: true, include_whitespace: true, allow_octal: false };
            Tokenizer::with_configuration(text, config).tokenize()
        }
        Err(e) => {
            let valid_point = e.valid_up_to();
            if valid_point > 0 {
                let valid_text = std::str::from_utf8(&bytes[..valid_point]).unwrap();
                let (lineno, colno, char_index) = get_line_col_char(valid_text, valid_point);
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno, colno, char_index, index: valid_point })
            } else {
                Err(TokenizationError { message: "Invalid UTF8 at".to_string(), lineno: 1, colno: 0, char_index: 0, index: 0 })
            }
        }
    }
}

#[cfg(test)]
mod test {
    use crate::tokenize::TokType::*;
    use super::*;

    #[test]
    fn test_empty_input() {
        let text = "";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, EOF, 0)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_empty_object() {
        let text = "{}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, RightBrace, 2), (2, EOF, 2)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_object() {
        let text = "{\"foo\":\"bar\"}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, DoubleQuotedString, 6), (6, Colon, 7), (7, DoubleQuotedString, 12), (12, RightBrace, 13), (13, EOF, 13)], source: text };
        assert_eq!(toks, expected)
    }

    #[test]
    fn test_object_rt() {
        let text = "{\"foo\":\"bar\"}";
        let toks = tokenize_rt_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, DoubleQuotedString, 6), (6, Colon, 7), (7, DoubleQuotedString, 12), (12, RightBrace, 13), (13, EOF, 13)], source: text };
        assert_eq!(toks, expected)
    }

    #[test]
    fn test_single_quoted_string() {
        let text = "{'foo':'bar'}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBrace, 1), (1, SingleQuotedString, 6), (6, Colon, 7), (7, SingleQuotedString, 12), (12, RightBrace, 13), (13, EOF, 13)], source: text };
        assert_eq!(toks, expected);
    }
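
    // Example: a string that reaches EOF before its closing quote is an error
    // (the unterminated-string path in `process_string`).
    #[test]
    fn test_unterminated_string() {
        tokenize_str("'abc").unwrap_err();
    }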

    #[test]
    fn test_array() {
        let text = "[1,2,3]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBracket, 1), (1, Integer, 2), (2, Comma, 3), (3, Integer, 4), (4, Comma, 5), (5, Integer, 6), (6, RightBracket, 7), (7, EOF, 7)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_float_number() {
        let text = "[1.23,4.56]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBracket, 1), (1, Float, 5), (5, Comma, 6), (6, Float, 10), (10, RightBracket, 11), (11, EOF, 11)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_exponent_number() {
        let text = "[1e10,2e-5]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LeftBracket, 1), (1, Exponent, 5), (5, Comma, 6), (6, Exponent, 10), (10, RightBracket, 11), (11, EOF, 11)], source: text };
        assert_eq!(toks, expected);
    }
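
    // Examples for the numeric edge paths; the expected spans follow from
    // `process_hexadecimal` and `process_number` as written above.
    #[test]
    fn test_hexadecimal() {
        let text = "0x1A";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, Hexadecimal, 4), (4, EOF, 4)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_trailing_exponent_is_error() {
        // A number literal may not end on its exponent marker.
        tokenize_str("1e").unwrap_err();
    }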

    #[test]
    fn test_whitespace() {
        let text = " {\n\t} ";
        let toks = Tokenizer::with_configuration(text, TokenizerConfig { include_whitespace: true, include_comments: true, allow_octal: false }).tokenize().unwrap();
        let expected = Tokens { tok_spans: vec![(0, Whitespace, 1), (1, LeftBrace, 2), (2, Whitespace, 4), (4, RightBrace, 5), (5, Whitespace, 6), (6, EOF, 6)], source: text };
        assert_eq!(toks, expected);
    }
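
    // Comment tokens under the round-trip config; expected spans follow from
    // `process_comment` as written above. Note that a line comment's span
    // includes its terminating newline; a block comment ends just past "*/".
    #[test]
    fn test_line_comment_rt() {
        let text = "// hi\n1";
        let toks = tokenize_rt_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, LineComment, 6), (6, Integer, 7), (7, EOF, 7)], source: text };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_block_comment_rt() {
        let text = "/*x*/";
        let toks = tokenize_rt_str(text).unwrap();
        let expected = Tokens { tok_spans: vec![(0, BlockComment, 5), (5, EOF, 5)], source: text };
        assert_eq!(toks, expected);
    }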

    #[test]
    fn test_true_false_null() {
        let text = "[true,false,null]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, LeftBracket, 1), (1, True, 5), (5, Comma, 6), (6, False, 11), (11, Comma, 12), (12, Null, 16), (16, RightBracket, 17), (17, EOF, 17)] };
        assert_eq!(toks, expected);
    }
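
    // `Infinity` and `NaN` are classified by `tok_from_indices`, like the
    // constants in the previous test.
    #[test]
    fn test_infinity_nan() {
        let text = "[Infinity,NaN]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, LeftBracket, 1), (1, Infinity, 9), (9, Comma, 10), (10, Nan, 13), (13, RightBracket, 14), (14, EOF, 14)] };
        assert_eq!(toks, expected);
    }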

    #[test]
    fn test_number() {
        let text = "123";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, Integer, 3), (3, EOF, 3)] };
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_unexpected_symbol() {
        let text = "1!2";
        tokenize_str(text).unwrap_err();
    }
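
    // The `Iterator` impl yields token spans but stops before the EOF token.
    #[test]
    fn test_iterator_stops_before_eof() {
        let spans: Vec<TokenSpan> = Tokenizer::new("{}").map(|r| r.unwrap()).collect();
        assert_eq!(spans, vec![(0, LeftBrace, 1), (1, RightBrace, 2)]);
    }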

    #[test]
    fn test_special_things() {
        let text = r#"{$_:1,_$:2,a\u200C:3}"#;
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, LeftBrace, 1), (1, Name, 3), (3, Colon, 4), (4, Integer, 5), (5, Comma, 6), (6, Name, 8), (8, Colon, 9), (9, Integer, 10), (10, Comma, 11), (11, Name, 18), (18, Colon, 19), (19, Integer, 20), (20, RightBrace, 21), (21, EOF, 21)] };
        assert_eq!(toks, expected)
    }

    #[test]
    fn test_eof_after_multibyte() {
        let text = r#"ë"#;
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, Name, 2), (2, EOF, 2)] };
        assert_eq!(toks, expected)
    }
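
    // Additional edge cases: a \uXXXX escape as an identifier start, BOM
    // skipping, and the invalid-UTF-8 path of `tokenize_bytes`. Expected spans
    // mirror the implementation above (the BOM is 3 bytes, so `{` starts at 3).
    #[test]
    fn test_unicode_escape_name() {
        let text = r"\u0041";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(0, Name, 6), (6, EOF, 6)] };
        assert_eq!(toks, expected)
    }

    #[test]
    fn test_bom_is_skipped() {
        let text = "\u{FEFF}{}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens { source: text, tok_spans: vec![(3, LeftBrace, 4), (4, RightBrace, 5), (5, EOF, 5)] };
        assert_eq!(toks, expected)
    }

    #[test]
    fn test_invalid_utf8_bytes() {
        tokenize_bytes(b"\xFF").unwrap_err();
    }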
}