json_five/tokenize.rs

use std::fmt::{Display, Formatter};
use std::iter::Peekable;
use std::str::CharIndices;
use crate::utils::get_line_col_char;

#[derive(PartialEq, Clone, Debug)]
pub enum TokType {
    LeftBrace,
    RightBrace,
    LeftBracket,
    RightBracket,
    Comma,
    Colon,
    Name,
    SingleQuotedString,
    DoubleQuotedString,
    BlockComment,
    LineComment,
    Whitespace,
    True,
    False,
    Null,
    Integer,
    Float,
    Infinity,
    Nan,
    Exponent,
    Hexadecimal,
    // Octal,
    Plus,
    Minus,
    EOF,
}

// Start byte offset, token type, end byte offset (noninclusive)
pub(crate) type TokenSpan = (usize, TokType, usize);
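// For example, tokenize_str("[1]") yields the spans
// (0, LeftBracket, 1), (1, Integer, 2), (2, RightBracket, 3), (3, EOF, 3).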

#[derive(Debug, PartialEq)]
pub struct Tokens<'input> {
    pub tok_spans: Vec<TokenSpan>,
    pub(crate) source: &'input str,
}

#[derive(Debug)]
pub struct TokenizationError {
    pub message: String,
    pub index: usize, // byte offset
    pub lineno: usize,
    pub colno: usize,
    pub char_index: usize, // char offset
}

impl Display for TokenizationError {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(f, "TokenizationError: {}: line {} column {} (char {})", self.message, self.lineno, self.colno, self.char_index)
    }
}

#[derive(Debug)]
pub(crate) struct Tokenizer<'input> {
    configuration: TokenizerConfig,
    text: &'input str,
    chars: Peekable<CharIndices<'input>>,
    lookahead: Option<(usize, char)>,
}

const HEX_CHARS: &str = "0123456789abcdefABCDEF";
const IDENTIFIER_START_SYMBOLS: &str = "$_";
const IDENTIFIER_PARTS: &str = "$_\u{200C}\u{200D}\u{005F}\u{203F}\u{2040}\u{2054}\u{FE33}\u{FE34}\u{FE4D}\u{FE4E}\u{FE4F}\u{FF3F}";

#[derive(Debug)]
pub struct TokenizerConfig {
    pub include_whitespace: bool,
    pub include_comments: bool,
    pub allow_octal: bool,
}

impl Default for TokenizerConfig {
    fn default() -> Self {
        Self::new()
    }
}

impl TokenizerConfig {
    pub fn new() -> Self {
        TokenizerConfig {include_whitespace: false, include_comments: false, allow_octal: false}
    }
}

impl<'input> Tokenizer<'input> {
    pub fn new(text: &'input str) -> Self {
        Tokenizer {configuration: TokenizerConfig::new(), text, chars: text.char_indices().peekable(), lookahead: None}
    }

    pub fn with_configuration(text: &'input str, configuration: TokenizerConfig) -> Self {
        Tokenizer {configuration, text, chars: text.char_indices().peekable(), lookahead: None}
    }

    // Consume the next character, remembering it in `lookahead`.
    fn advance(&mut self) -> Option<(usize, char)> {
        self.lookahead = self.chars.next();
        self.lookahead
    }

    fn make_error(&self, message: String, start_index: usize) -> TokenizationError {
        let (lineno, colno, char_index) = get_line_col_char(self.text, start_index);
        TokenizationError{message, index: start_index, lineno, colno, char_index}
    }

    fn process_string(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, quote_char) = self.lookahead.expect("Expected quote character");

        let string_type: TokType = match quote_char {
            '"' => TokType::DoubleQuotedString,
            '\'' => TokType::SingleQuotedString,
            _ => unreachable!("Expected quote character, but got {:?}", quote_char)
        };

        let mut last_char = quote_char;
        let mut escaping = false;
        loop {
            match self.advance() {
                None => {
                    break Err(self.make_error("Unterminated string starting at".to_string(), start_idx))
                },
                Some((idx, char)) => {
                    match char {
                        '\\' => {
                            // Toggle: the second backslash of an escaped pair ends the escape.
                            escaping = !escaping;
                            last_char = char;
                            continue
                        }
                        '\n' => {
                            // A bare \n is only legal as a line continuation
                            // (after a backslash) or as the tail of \r\n.
                            if !escaping && last_char != '\r' {
                                break Err(self.make_error("Unexpected line terminator without continuation in string literal at".to_string(), idx))
                            }
                            escaping = false;
                            last_char = char;
                            continue
                        }
                        '\r' | '\u{2028}' | '\u{2029}' => {
                            if !escaping {
                                break Err(self.make_error("Unexpected line terminator without continuation in string literal at".to_string(), idx))
                            }
                            escaping = false;
                            last_char = char;
                            continue
                        },
                        c if c == quote_char && !escaping => {
                            break Ok((start_idx, string_type, idx + 1))
                        },
                        _ => {
                            escaping = false;
                            last_char = char;
                            continue
                        }
                    }
                }
            }
        }
    }

    fn process_whitespace(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting whitespace char");
        let mut last_index = start_idx;
        let mut last_char = start_char;
        loop {
            match self.chars.peek() {
                None => break Ok((start_idx, TokType::Whitespace, last_index + last_char.len_utf8())),
                Some((peeked_idx, peeked_char)) => {
                    if peeked_char.is_whitespace() {
                        last_index = *peeked_idx;
                        last_char = *peeked_char;
                        self.advance();
                        continue
                    } else {
                        break Ok((start_idx, TokType::Whitespace, last_index + last_char.len_utf8()))
                    }
                }
            }
        }
    }

    fn process_octal(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, _start_char) = self.lookahead.expect("Unexpected end of input, was processing octal");
        if self.configuration.allow_octal {
            todo!()
        } else {
            Err(self.make_error("Octal literals are forbidden".to_string(), start_idx))
        }
    }

    fn process_hexadecimal(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting numeric char");
        let (_, x_char) = self.advance().expect("Expected hex x");
        assert_eq!(start_char, '0');
        if x_char != 'x' && x_char != 'X' {
            unreachable!("Invalid hexadecimal here")
        }

        match self.advance() {
            None => {
                Err(self.make_error("Expected at least one digit in hexadecimal literal".to_string(), start_idx))
            }
            Some((mut last_idx, first_digit)) => {
                if !HEX_CHARS.contains(first_digit) {
                    return Err(self.make_error(format!("Invalid hexadecimal character {first_digit:?} in literal starting at"), start_idx))
                }
                loop {
                    match self.chars.peek() {
                        None => break Ok((start_idx, TokType::Hexadecimal, last_idx + 1)),
                        Some((offset, char)) => {
                            if !HEX_CHARS.contains(*char) {
                                break Ok((start_idx, TokType::Hexadecimal, last_idx + 1))
                            }
                            last_idx = *offset;
                            self.advance();
                            continue
                        }
                    }
                }
            }
        }
    }

    fn process_number(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting numeric char");
        let mut last_index = start_idx;
        let mut decimal_seen: bool = false;
        let mut exponent_seen: bool = false;
        let mut unary_seen: bool = false;
        if start_char == '.' {
            decimal_seen = true
        }

        let maybe_second_char = self.chars.peek();
        match maybe_second_char {
            None => {
                if decimal_seen {
                    return Err(self.make_error("Lone decimal is an invalid literal".to_string(), start_idx))
                }
                return Ok((start_idx, TokType::Integer, start_idx + 1))
            },
            Some((_second_idx, second_char)) if start_char == '0' => {
                // A leading zero can only begin a hex literal ("0x...") or an
                // octal literal ("0123"), which process_octal may reject.
                match second_char {
                    'x' | 'X' => {return self.process_hexadecimal()}
                    sc if sc.is_ascii_digit() => {
                        return self.process_octal()
                    },
                    _ => {}
                }
            }
            _ => {}
        }

        loop {
            match self.chars.peek() {
                None => {
                    if unary_seen || exponent_seen {
                        let (_, last_char) = self.lookahead.unwrap();
                        if "+-eE".contains(last_char) {
                            return Err(self.make_error(format!("Invalid number literal (missing digit after {last_char:?})"), start_idx))
                        }
                    }
                    if exponent_seen {
                        break Ok((start_idx, TokType::Exponent, last_index + 1))
                    } else if decimal_seen {
                        if start_idx == last_index {
                            return Err(self.make_error("Lone decimal is an invalid number literal".to_string(), start_idx))
                        }
                        break Ok((start_idx, TokType::Float, last_index + 1))
                    } else {
                        break Ok((start_idx, TokType::Integer, last_index + 1))
                    }
                },
                Some((next_idx, next_char)) => {
                    match *next_char {
                        c if c.is_ascii_digit() => {
                            last_index = *next_idx;
                            self.advance();
                            continue
                        },
                        '.' => {
                            if decimal_seen {
                                return Err(self.make_error("Invalid number literal (unexpected decimal)".to_string(), start_idx))
                            }
                            decimal_seen = true;
                            if exponent_seen {
                                return Err(self.make_error("Invalid exponent literal (float exponents forbidden) at".to_string(), start_idx))
                            }
                            last_index = *next_idx;
                            self.advance();
                            continue
                        },
                        'e' | 'E' => {
                            if exponent_seen {
                                return Err(self.make_error("Invalid number literal (only one exponent part is allowed)".to_string(), start_idx))
                            }
                            exponent_seen = true;
                            last_index = *next_idx;
                            self.advance();
                        }
                        '+' | '-' => {
                            let (_, previous_char) = self.lookahead.unwrap();
                            unary_seen = true;
                            match previous_char {
                                'e' | 'E' => {
                                    last_index = *next_idx;
                                    self.advance();
                                }
                                _ => {
                                    return Err(self.make_error("Unary within number literal only allowed after exponent part".to_string(), start_idx))
                                }
                            }
                        }
                        _ => {
                            // The peeked character can't be part of a number.
                            // Verify the number is valid before finishing the token.
                            if unary_seen || exponent_seen {
                                let (_, last_char) = self.lookahead.unwrap();
                                if "+-eE".contains(last_char) {
                                    return Err(self.make_error(format!("Invalid number literal (missing digit after {last_char:?})"), start_idx))
                                }
                            }
                            if exponent_seen {
                                break Ok((start_idx, TokType::Exponent, last_index + 1))
                            } else if decimal_seen {
                                if start_idx == last_index {
                                    return Err(self.make_error("Lone decimal is an invalid number literal".to_string(), start_idx))
                                }
                                break Ok((start_idx, TokType::Float, last_index + 1))
                            } else {
                                break Ok((start_idx, TokType::Integer, last_index + 1))
                            }
                        }
                    }
                }
            }
        }
    }

    fn tok_from_indices(&self, start: usize, end: usize) -> Result<TokenSpan, TokenizationError> {
        let lexeme = &self.text[start..end];
        match lexeme {
            "true" => Ok((start, TokType::True, end)),
            "false" => Ok((start, TokType::False, end)),
            "NaN" => Ok((start, TokType::Nan, end)),
            "Infinity" => Ok((start, TokType::Infinity, end)),
            "null" => Ok((start, TokType::Null, end)),
            _ => Ok((start, TokType::Name, end)),
        }
    }

    fn process_identifier_or_const(&mut self) -> Result<TokenSpan, TokenizationError> {
        use crate::utils::read_hex_digits;
        use unicode_general_category::{get_general_category, GeneralCategory};
        let (start_idx, start_char) = self.lookahead.expect("Unexpected end of input, was expecting identifier/const char");
        let mut last_idx = start_idx;
        match start_char {
            c if c.is_alphabetic() => {}
            c if IDENTIFIER_START_SYMBOLS.contains(c) => {}
            '\\' => {
                // The only escape allowed in an identifier is \uXXXX, and the
                // escaped character must itself be a valid identifier start.
                match self.chars.peek() {
                    None => {return Err(self.make_error("Unexpected EOF".to_string(), start_idx))}
                    Some((_, c)) => {
                        match c {
                            'u' => {
                                let mut ubuffer = String::with_capacity(4);
                                self.advance();
                                for _ in 0..4 {
                                    match self.advance() {
                                        None => {
                                            return Err(self.make_error("Invalid identifier start".to_string(), start_idx))
                                        }
                                        Some((idx, c)) => {
                                            ubuffer.push(c);
                                            last_idx = idx;
                                            if !HEX_CHARS.contains(c) {
                                                return Err(self.make_error("Invalid identifier start".to_string(), start_idx))
                                            }
                                        }
                                    }
                                }
                                let maybe_hex_val = read_hex_digits(&mut ubuffer.chars().peekable(), 4, ubuffer.as_str());
                                match maybe_hex_val {
                                    Err(_) => {
                                        return Err(self.make_error(format!("invalid unicode escape: \\u{ubuffer}"), start_idx))
                                    }
                                    Ok(hex_val) => {
                                        let maybe_c = char::from_u32(hex_val);
                                        match maybe_c {
                                            None => {
                                                return Err(self.make_error(format!("invalid unicode escape value: \\u{ubuffer}"), start_idx))
                                            }
                                            Some(c) => {
                                                if !c.is_alphabetic() && !IDENTIFIER_START_SYMBOLS.contains(c) {
                                                    return Err(self.make_error(format!("Illegal identifier start from unicode escape sequence: \\u{ubuffer}"), start_idx))
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                            _ => {
                                return Err(self.make_error("Invalid identifier start".to_string(), start_idx))
                            }
                        }
                    }
                }
            }
            _ => {
                return Err(self.make_error(format!("Invalid character {start_char}"), start_idx))
            }
        }
        let mut last_char = start_char;
        loop {
            match self.chars.peek() {
                None => break self.tok_from_indices(start_idx, last_idx + last_char.len_utf8()),
                Some((next_idx, next_char)) => {
                    if next_char.is_whitespace() {
                        break self.tok_from_indices(start_idx, last_idx + last_char.len_utf8())
                    } else if next_char.is_alphanumeric() {
                        last_idx = *next_idx;
                        last_char = *next_char;
                        self.advance();
                        continue
                    } else if IDENTIFIER_PARTS.contains(*next_char) {
                        last_idx = *next_idx;
                        last_char = *next_char;
                        self.advance();
                        continue
                    } else if *next_char == '\\' {
                        // \uXXXX escapes may also continue an identifier.
                        self.advance();
                        match self.advance() {
                            None => {return Err(self.make_error("Unexpected EOF".to_string(), start_idx))}
                            Some((_, c)) => {
                                match c {
                                    'u' => {
                                        for _ in 0..4 {
                                            match self.advance() {
                                                None => {
                                                    return Err(self.make_error("Unexpected end of input in unicode escape in identifier".to_string(), start_idx))
                                                }
                                                Some((_, c)) => {
                                                    if !HEX_CHARS.contains(c) {
                                                        return Err(self.make_error("Invalid hex digit in unicode escape in identifier".to_string(), start_idx))
                                                    }
                                                }
                                            }
                                        }
                                        (last_idx, last_char) = self.lookahead.unwrap()
                                    }
                                    _ => {
                                        return Err(self.make_error("Invalid escape in identifier (expected \\u)".to_string(), start_idx))
                                    }
                                }
                            }
                        }
                    } else {
                        // Combining marks may continue an identifier even though
                        // they are neither alphanumeric nor in IDENTIFIER_PARTS.
                        match get_general_category(*next_char) {
                            GeneralCategory::NonspacingMark | GeneralCategory::SpacingMark => {
                                last_idx = *next_idx;
                                last_char = *next_char;
                                self.advance();
                                continue
                            }
                            _ => break self.tok_from_indices(start_idx, last_idx + last_char.len_utf8())
                        }
                    }
                }
            }
        }
    }

    fn process_comment(&mut self) -> Result<TokenSpan, TokenizationError> {
        let (start_idx, _char) = self.lookahead.expect("Expected comment start");
        let (mut last_idx, star_or_slash) = self.advance().expect("Expected second comment char");
        match star_or_slash {
            '/' => {
                // Line comment: runs to (and includes) the line terminator.
                loop {
                    match self.chars.peek() {
                        None => {
                            return Ok((start_idx, TokType::LineComment, last_idx + 1))
                        },
                        Some((peeked_idx, peeked_char)) => {
                            match peeked_char {
                                '\n' | '\r' | '\u{2028}' | '\u{2029}' => {
                                    (last_idx, _) = self.advance().unwrap();
                                    return Ok((start_idx, TokType::LineComment, last_idx + 1))
                                }
                                _ => {
                                    last_idx = *peeked_idx;
                                    self.advance();
                                }
                            }
                        }
                    }
                }
            },
            '*' => {
                // Block comment: runs to the closing */.
                loop {
                    match self.chars.peek() {
                        None => {
                            return Err(self.make_error("Unexpected end of input while processing block comment".to_string(), start_idx))
                        }
                        Some((_peeked_idx, peeked_char)) => {
                            match peeked_char {
                                '*' => {
                                    self.advance();
                                    let maybe_next_next = self.chars.peek();
                                    match maybe_next_next {
                                        None => {
                                            return Err(self.make_error("Unexpected end of input while processing block comment".to_string(), start_idx))
                                        },
                                        Some((_next_peeked_idx, next_peeked_char)) => {
                                            match next_peeked_char {
                                                '/' => {
                                                    // End offsets are noninclusive, so the span must
                                                    // extend one past the closing '/'.
                                                    (last_idx, _) = self.advance().unwrap();
                                                    return Ok((start_idx, TokType::BlockComment, last_idx + 1))
                                                }
                                                _ => {
                                                    continue
                                                }
                                            }
                                        }
                                    }
                                }
                                _ => {
                                    self.advance();
                                    continue
                                }
                            }
                        }
                    }
                }
            }
            _ => unreachable!("Invalid second comment char")
        }
    }

    fn next_token(&mut self) -> Result<TokenSpan, TokenizationError> {
        let maybe_last = self.lookahead;
        let maybe_next = self.advance();
        match maybe_next {
            None => {
                match maybe_last {
                    Some((last_idx, last_char)) => Ok((last_idx + last_char.len_utf8(), TokType::EOF, last_idx + last_char.len_utf8())),
                    None => Ok((0, TokType::EOF, 0)),
                }
            }
            Some((next_idx, next)) => {
                match next {
                    '{' => Ok((next_idx, TokType::LeftBrace, next_idx + 1)),
                    '}' => Ok((next_idx, TokType::RightBrace, next_idx + 1)),
                    '[' => Ok((next_idx, TokType::LeftBracket, next_idx + 1)),
                    ']' => Ok((next_idx, TokType::RightBracket, next_idx + 1)),
                    ',' => Ok((next_idx, TokType::Comma, next_idx + 1)),
                    ':' => Ok((next_idx, TokType::Colon, next_idx + 1)),
                    '+' => Ok((next_idx, TokType::Plus, next_idx + 1)),
                    '-' => Ok((next_idx, TokType::Minus, next_idx + 1)),
                    '\'' | '"' => self.process_string(),
                    '.' => self.process_number(),
                    '\u{FEFF}' => {
                        // The BOM counts as whitespace but is not matched by
                        // char::is_whitespace, so it gets its own arm.
                        let whitespace_tok = self.process_whitespace()?;
                        if self.configuration.include_whitespace {
                            Ok(whitespace_tok)
                        } else {
                            self.next_token()
                        }
                    }
                    c if c.is_whitespace() => {
                        let whitespace_tok = self.process_whitespace()?;
                        if self.configuration.include_whitespace {
                            Ok(whitespace_tok)
                        } else {
                            self.next_token()
                        }
                    },
                    c if c.is_ascii_digit() => self.process_number(),
                    '/' => {
                        // A dummy ('!' at usize::MAX) stands in at end of input so
                        // a trailing '/' falls through to the error arm below.
                        let (_, next_next) = self.chars.peek().unwrap_or(&(usize::MAX, '!'));
                        match next_next {
                            '/' | '*' => {
                                if self.configuration.include_comments {
                                    self.process_comment()
                                } else {
                                    self.process_comment()?;
                                    self.next_token()
                                }
                            },
                            _ => {
                                Err(self.make_error("unexpected token '/'".to_string(), next_idx))
                            }
                        }
                    }
                    _ => self.process_identifier_or_const()
                }
            }
        }
    }

    pub(crate) fn tokenize(&mut self) -> Result<Tokens<'input>, TokenizationError> {
        let mut tokens: Vec<TokenSpan> = Vec::new();
        loop {
            let tok = self.next_token()?;
            let is_eof = tok.1 == TokType::EOF;
            tokens.push(tok);
            if is_eof {
                break
            }
        }
        Ok(Tokens{ tok_spans: tokens, source: self.text})
    }
}

impl<'input> Iterator for Tokenizer<'input> {
    type Item = Result<TokenSpan, TokenizationError>;
    fn next(&mut self) -> Option<Self::Item> {
        // Iteration ends just before the EOF token; errors are yielded as items.
        match self.next_token() {
            Ok(span) => {
                match span.1 {
                    TokType::EOF => None,
                    _ => Some(Ok(span))
                }
            }
            Err(e) => Some(Err(e))
        }
    }
}


/// Turn a str into [Tokens].
///
/// Usually not used directly.
/// Token spans will not include whitespace or comment tokens.
pub fn tokenize_str(text: &'_ str) -> Result<Tokens<'_>, TokenizationError> {
    Tokenizer::new(text).tokenize()
}

/// Like [tokenize_str], but includes whitespace and comment tokens
/// (for round-tripping).
pub fn tokenize_rt_str(text: &'_ str) -> Result<Tokens<'_>, TokenizationError> {
    let config = TokenizerConfig{include_comments: true, include_whitespace: true, allow_octal: false};
    Tokenizer::with_configuration(text, config).tokenize()
}

/// Like [tokenize_str] but for bytes
pub fn tokenize_bytes(bytes: &'_ [u8]) -> Result<Tokens<'_>, TokenizationError> {
    let maybe_text = std::str::from_utf8(bytes);
    match maybe_text {
        Ok(text) => {
            Tokenizer::new(text).tokenize()
        }
        Err(e) => {
            // Report the position at which the bytes stop being valid UTF-8.
            let valid_point = e.valid_up_to();
            if valid_point > 0 {
                let valid_text = std::str::from_utf8(&bytes[..valid_point]).unwrap();
                let (lineno, colno, char_index) = get_line_col_char(valid_text, valid_point);
                Err(TokenizationError{message: "Invalid UTF8 at".to_string(), lineno, colno, char_index, index: valid_point})
            } else {
                Err(TokenizationError{message: "Invalid UTF8 at".to_string(), lineno: 1, colno: 0, char_index: 0, index: 0})
            }
        }
    }
}

/// Like [tokenize_rt_str] but for bytes
pub fn tokenize_rt_bytes(bytes: &'_ [u8]) -> Result<Tokens<'_>, TokenizationError> {
    let maybe_text = std::str::from_utf8(bytes);
    match maybe_text {
        Ok(text) => {
            let config = TokenizerConfig{include_comments: true, include_whitespace: true, allow_octal: false};
            Tokenizer::with_configuration(text, config).tokenize()
        }
        Err(e) => {
            // Same UTF-8 error reporting as [tokenize_bytes].
            let valid_point = e.valid_up_to();
            if valid_point > 0 {
                let valid_text = std::str::from_utf8(&bytes[..valid_point]).unwrap();
                let (lineno, colno, char_index) = get_line_col_char(valid_text, valid_point);
                Err(TokenizationError{message: "Invalid UTF8 at".to_string(), lineno, colno, char_index, index: valid_point})
            } else {
                Err(TokenizationError{message: "Invalid UTF8 at".to_string(), lineno: 1, colno: 0, char_index: 0, index: 0})
            }
        }
    }
}


#[cfg(test)]
mod test {
    use crate::tokenize::TokType::*;
    use super::*;
    #[test]
    fn test_foo() {
        let text = "";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens{ tok_spans: vec![(0, EOF, 0)], source: text};
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_heck() {
        let text = "{}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens{ tok_spans: vec![(0, LeftBrace, 1), (1, RightBrace, 2), (2, EOF, 2)], source: text};
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_heck2() {
        let text = "{\"foo\":\"bar\"}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens{ tok_spans: vec![(0, LeftBrace, 1), (1, DoubleQuotedString, 6), (6, Colon, 7), (7, DoubleQuotedString, 12), (12, RightBrace, 13), (13, EOF, 13)], source: text};
        assert_eq!(toks, expected)
    }

    #[test]
    fn test_heck3() {
        let text = "{\"foo\":\"bar\"}";
        let toks = tokenize_rt_str(text).unwrap();
        let expected = Tokens{ tok_spans: vec![(0, LeftBrace, 1), (1, DoubleQuotedString, 6), (6, Colon, 7), (7, DoubleQuotedString, 12), (12, RightBrace, 13), (13, EOF, 13)], source: text};
        assert_eq!(toks, expected)
    }
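
    // A sketch of expected comment spans via tokenize_rt_str: a line comment
    // token includes its terminating newline, and span ends are noninclusive
    // byte offsets, so a block comment span extends one past the closing '/'
    // (see process_comment).
    #[test]
    fn test_line_comment_rt() {
        let text = "// hi\n1";
        let toks = tokenize_rt_str(text).unwrap();
        let expected = Tokens{ tok_spans: vec![(0, LineComment, 6), (6, Integer, 7), (7, EOF, 7)], source: text};
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_block_comment_rt() {
        let text = "/**/1";
        let toks = tokenize_rt_str(text).unwrap();
        let expected = Tokens{ tok_spans: vec![(0, BlockComment, 4), (4, Integer, 5), (5, EOF, 5)], source: text};
        assert_eq!(toks, expected);
    }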

    #[test]
    fn test_single_quoted_string() {
        let text = "{'foo':'bar'}";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens{ tok_spans: vec![(0, LeftBrace, 1), (1, SingleQuotedString, 6), (6, Colon, 7), (7, SingleQuotedString, 12), (12, RightBrace, 13), (13, EOF, 13)], source: text};
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_array() {
        let text = "[1,2,3]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens{ tok_spans: vec![(0, LeftBracket, 1), (1, Integer, 2), (2, Comma, 3), (3, Integer, 4), (4, Comma, 5), (5, Integer, 6), (6, RightBracket, 7), (7, EOF, 7)], source: text};
        assert_eq!(toks, expected);
    }
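
    // The Iterator impl yields Result-wrapped spans one at a time and stops
    // just before the EOF token; a quick sketch of that contract:
    #[test]
    fn test_iterator_stops_before_eof() {
        let spans: Vec<TokenSpan> = Tokenizer::new("[]").map(|r| r.unwrap()).collect();
        assert_eq!(spans, vec![(0, LeftBracket, 1), (1, RightBracket, 2)]);
    }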

    #[test]
    fn test_float_number() {
        let text = "[1.23,4.56]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens{ tok_spans: vec![(0, LeftBracket, 1), (1, Float, 5), (5, Comma, 6), (6, Float, 10), (10, RightBracket, 11), (11, EOF, 11)], source: text};
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_exponent_number() {
        let text = "[1e10,2e-5]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens{ tok_spans: vec![(0, LeftBracket, 1), (1, Exponent, 5), (5, Comma, 6), (6, Exponent, 10), (10, RightBracket, 11), (11, EOF, 11)], source: text};
        assert_eq!(toks, expected);
    }
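
    // A sketch of hexadecimal tokenization: the whole literal, including the
    // 0x prefix, is one Hexadecimal span.
    #[test]
    fn test_hexadecimal() {
        let text = "0x1F";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens{source: text, tok_spans: vec![(0, Hexadecimal, 4), (4, EOF, 4)]};
        assert_eq!(toks, expected);
    }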

    #[test]
    fn test_whitespace() {
        let text = " {\n\t} ";
        let toks = Tokenizer::with_configuration(text, TokenizerConfig{include_whitespace: true, include_comments: true, allow_octal: false}).tokenize().unwrap();
        let expected = Tokens{ tok_spans: vec![(0, Whitespace, 1), (1, LeftBrace, 2), (2, Whitespace, 4), (4, RightBrace, 5), (5, Whitespace, 6), (6, EOF, 6)], source: text};
        assert_eq!(toks, expected);
    }
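
    // The BOM (U+FEFF) is treated as whitespace even though
    // char::is_whitespace does not match it; it occupies 3 bytes in UTF-8,
    // so the following token starts at byte offset 3.
    #[test]
    fn test_bom_is_whitespace() {
        let text = "\u{FEFF}1";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens{source: text, tok_spans: vec![(3, Integer, 4), (4, EOF, 4)]};
        assert_eq!(toks, expected);
    }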

    #[test]
    fn test_true_false_null() {
        let text = "[true,false,null]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens{source: text, tok_spans: vec![(0, LeftBracket, 1), (1, True, 5), (5, Comma, 6), (6, False, 11), (11, Comma, 12), (12, Null, 16), (16, RightBracket, 17), (17, EOF, 17)]};
        assert_eq!(toks, expected);
    }

    #[test]
    fn test_number() {
        let text = "123";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens{source: text, tok_spans: vec![(0, Integer, 3), (3, EOF, 3)]};
        assert_eq!(toks, expected);
    }
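
    // Invalid UTF-8 input surfaces as a TokenizationError rather than a panic;
    // with no valid prefix, the reported index is 0.
    #[test]
    fn test_invalid_utf8_bytes() {
        let err = tokenize_bytes(&[0xFF]).unwrap_err();
        assert_eq!(err.index, 0);
    }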

    #[test]
    fn test_unexpected_symbol() {
        let text = "1!2";
        tokenize_str(text).unwrap_err();
    }
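
    // An unterminated string is an error whose byte index points back at the
    // opening quote.
    #[test]
    fn test_unterminated_string() {
        let err = tokenize_str("'abc").unwrap_err();
        assert_eq!(err.index, 0);
    }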

    #[test]
    fn test_special_things() {
        let text = r#"{$_:1,_$:2,a\u200C:3}"#;
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens{source: text, tok_spans: vec![(0, LeftBrace, 1), (1, Name, 3), (3, Colon, 4), (4, Integer, 5), (5, Comma, 6), (6, Name, 8), (8, Colon, 9), (9, Integer, 10), (10, Comma, 11), (11, Name, 18), (18, Colon, 19), (19, Integer, 20), (20, RightBrace, 21), (21, EOF, 21)]};
        assert_eq!(toks, expected)
    }
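
    // A sketch of escape handling: a \uXXXX escape may start an identifier
    // when the escaped character is a legal identifier start, and the Name
    // span covers the raw escape text.
    #[test]
    fn test_unicode_escape_identifier_start() {
        let text = r#"{\u0041:1}"#;
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens{source: text, tok_spans: vec![(0, LeftBrace, 1), (1, Name, 7), (7, Colon, 8), (8, Integer, 9), (9, RightBrace, 10), (10, EOF, 10)]};
        assert_eq!(toks, expected)
    }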

    #[test]
    fn test_eof_after_multibyte() {
        let text = r#"ë"#;
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens{source: text, tok_spans: vec![(0, Name, 2), (2, EOF, 2)]};
        assert_eq!(toks, expected)
    }
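
    // Infinity and NaN lexemes become dedicated const tokens rather than Names
    // (see tok_from_indices).
    #[test]
    fn test_infinity_nan() {
        let text = "[Infinity,NaN]";
        let toks = tokenize_str(text).unwrap();
        let expected = Tokens{source: text, tok_spans: vec![(0, LeftBracket, 1), (1, Infinity, 9), (9, Comma, 10), (10, Nan, 13), (13, RightBracket, 14), (14, EOF, 14)]};
        assert_eq!(toks, expected)
    }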
}