chisel_json/lexer/
lexer_core.rs

1//! Lexer used by both DOM and SAX parsers
2//!
3#![allow(unused_assignments)]
4#![allow(unused_variables)]
5#![allow(unreachable_code)]
6
7use crate::coords::{Coords, Span};
8use std::fmt::{Display, Formatter};
9use std::io::BufRead;
10
11use crate::lexer::lexer_input::{CharWithCoords, LexerInput};
12use crate::lexer_error;
13use crate::results::{ParserError, ParserErrorDetails, ParserErrorSource, ParserResult};
14
/// Default size of the lookahead buffer
const DEFAULT_BUFFER_SIZE: usize = 4096;
/// Character sequence spelling out the `null` literal
const NULL_PATTERN: [char; 4] = ['n', 'u', 'l', 'l'];
/// Character sequence spelling out the `true` literal
const TRUE_PATTERN: [char; 4] = ['t', 'r', 'u', 'e'];
/// Character sequence spelling out the `false` literal
const FALSE_PATTERN: [char; 5] = ['f', 'a', 'l', 's', 'e'];
23
/// The set of tokens the lexer can produce from JSON input
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// Produced for `{`
    StartObject,
    /// Produced for `}`
    EndObject,
    /// Produced for `[`
    StartArray,
    /// Produced for `]`
    EndArray,
    /// Produced for `:`
    Colon,
    /// Produced for `,`
    Comma,
    /// A string, carrying the text captured by the lexer
    Str(String),
    /// A floating point number
    Float(f64),
    /// An integral number
    Integer(i64),
    /// The `null` literal
    Null,
    /// The `true` or `false` literal
    Boolean(bool),
    /// Marker produced once the input has been exhausted
    EndOfInput,
}
40
41impl Display for Token {
42    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
43        match self {
44            Token::StartObject => write!(f, "StartObject"),
45            Token::EndObject => write!(f, "EndObject"),
46            Token::StartArray => write!(f, "StartArray"),
47            Token::EndArray => write!(f, "EndArray"),
48            Token::Colon => write!(f, "Colon"),
49            Token::Comma => write!(f, "Comma"),
50            Token::Str(str) => write!(f, "String(\"{}\")", str),
51            Token::Float(num) => write!(f, "Float({})", num),
52            Token::Integer(num) => write!(f, "Integer({})", num),
53            Token::Null => write!(f, "Null"),
54            Token::Boolean(bool) => write!(f, "Boolean({})", bool),
55            Token::EndOfInput => write!(f, "EndOfInput"),
56        }
57    }
58}
59
60/// A packed token consists of a [Token] and the [Span] associated with it
61pub type PackedToken<'a> = (Token, Span);
62
/// Build an `Ok` result holding a token and its [Span].
///
/// With three arguments the span runs from the second argument to the third; with two
/// arguments the same coordinates are used for both the start and the end of the span.
macro_rules! packed_token {
    ($t:expr, $s:expr) => {
        packed_token!($t, $s, $s)
    };
    ($t:expr, $s:expr, $e:expr) => {
        Ok(($t, Span { start: $s, end: $e }))
    };
}
72
/// Pattern: the digit zero
macro_rules! match_zero {
    () => { '0' };
}

/// Pattern: a minus sign
macro_rules! match_minus {
    () => { '-' };
}

/// Pattern: a plus or a minus sign
macro_rules! match_plus_minus {
    () => { '+' | '-' };
}

/// Pattern: any ASCII decimal digit
macro_rules! match_digit {
    () => { '0'..='9' };
}

/// Pattern: any ASCII decimal digit other than zero
macro_rules! match_non_zero_digit {
    () => { '1'..='9' };
}

/// Pattern: the exponent marker, in either case
macro_rules! match_exponent {
    () => { 'e' | 'E' };
}

/// Pattern: a period (decimal point)
macro_rules! match_period {
    () => { '.' };
}

/// Pattern: the structural characters which may directly follow a number
macro_rules! match_numeric_terminator {
    () => { ']' | '}' | ',' };
}

/// Pattern: the backslash which starts an escape sequence
macro_rules! match_escape {
    () => { '\\' };
}

/// Pattern: the characters allowed after a backslash in a non-unicode escape
macro_rules! match_escape_non_unicode_suffix {
    () => { 'n' | 't' | 'r' | '\\' | '/' | 'b' | 'f' | '\"' };
}

/// Pattern: the character which follows a backslash in a unicode escape
macro_rules! match_escape_unicode_suffix {
    () => { 'u' };
}

/// Pattern: a double quote
macro_rules! match_quote {
    () => { '\"' };
}

/// Pattern: a line feed
macro_rules! match_newline {
    () => { '\n' };
}
150
151pub struct Lexer<'a> {
152    /// Input coordinate state
153    input: LexerInput<'a>,
154}
155
156impl<'a> Lexer<'a> {
157    pub fn new(chars: &'a mut impl Iterator<Item = char>) -> Self {
158        Lexer {
159            input: LexerInput::new(chars),
160        }
161    }
162
163    /// Get the front of the input
164    fn front(&self) -> Option<CharWithCoords> {
165        self.input.front()
166    }
167
168    /// Get the back of the input
169    fn back(&self) -> Option<CharWithCoords> {
170        self.input.back()
171    }
172
173    /// Grab the front character
174    fn front_char(&self) -> char {
175        self.input.front().unwrap().ch
176    }
177
178    /// Grab the back character
179    fn back_char(&self) -> char {
180        self.input.back().unwrap().ch
181    }
182
183    /// Grab the front input coordinates
184    fn front_coords(&self) -> Coords {
185        self.input.front().unwrap().coords
186    }
187
188    /// Grab the back input coordinates
189    fn back_coords(&self) -> Coords {
190        self.input.back().unwrap().coords
191    }
192
193    /// Grab the current absolute input coordinates
194    fn absolute_position(&self) -> Coords {
195        self.input.position()
196    }
197
198    /// Advance the input by one
199    fn advance(&mut self, skip_whitespace: bool) -> ParserResult<()> {
200        self.input.advance(skip_whitespace)
201    }
202
203    /// Advance the input by n
204    fn advance_n(&mut self, n: usize, skip_whitespace: bool) -> ParserResult<()> {
205        self.input.advance_n(n, skip_whitespace)
206    }
207
208    /// Grab the current input string
209    fn current_string(&mut self) -> String {
210        self.input.buffer_as_string_with_span().str
211    }
212
213    /// Grab the current input character array
214    fn current_chars(&mut self) -> Vec<char> {
215        self.input.buffer_as_char_array()
216    }
217
218    /// Grab the current input byte array
219    fn current_bytes(&mut self) -> Vec<u8> {
220        self.input.buffer_as_byte_array()
221    }
222
223    /// Consume the next [Token] from the input
224    pub fn consume(&mut self) -> ParserResult<PackedToken> {
225        self.input.clear();
226        match self.advance(true) {
227            Ok(_) => match self.input.front() {
228                Some(CharWithCoords { ch: '{', coords }) => {
229                    packed_token!(Token::StartObject, coords)
230                }
231                Some(CharWithCoords { ch: '}', coords }) => packed_token!(Token::EndObject, coords),
232                Some(CharWithCoords { ch: '[', coords }) => {
233                    packed_token!(Token::StartArray, coords)
234                }
235                Some(CharWithCoords { ch: ']', coords }) => packed_token!(Token::EndArray, coords),
236                Some(CharWithCoords { ch: ':', coords }) => packed_token!(Token::Colon, coords),
237                Some(CharWithCoords { ch: ',', coords }) => packed_token!(Token::Comma, coords),
238                Some(CharWithCoords { ch: '\"', coords }) => self.match_string(),
239                Some(CharWithCoords { ch: 'n', coords }) => self.match_null(),
240                Some(CharWithCoords { ch: 't', coords }) => self.match_true(),
241                Some(CharWithCoords { ch: 'f', coords }) => self.match_false(),
242                Some(CharWithCoords { ch: '-', coords }) => self.match_number(),
243                Some(CharWithCoords { ch: d, coords }) if d.is_ascii_digit() => self.match_number(),
244                Some(CharWithCoords { ch, coords }) => lexer_error!(
245                    ParserErrorDetails::InvalidCharacter(ch.clone()),
246                    coords.clone()
247                ),
248                None => lexer_error!(ParserErrorDetails::EndOfInput),
249            },
250            Err(err) => match err.details {
251                ParserErrorDetails::EndOfInput => {
252                    packed_token!(Token::EndOfInput, self.input.position())
253                }
254                _ => match err.coords {
255                    Some(coords) => lexer_error!(err.details, coords),
256                    None => lexer_error!(err.details),
257                },
258            },
259        }
260    }
261
262    /// Match on a valid Json string.
263    fn match_string(&mut self) -> ParserResult<PackedToken> {
264        loop {
265            match self.advance(false) {
266                Ok(_) => match self.front_char() {
267                    match_escape!() => match self.input.advance(false) {
268                        Ok(_) => match self.front_char() {
269                            match_escape_non_unicode_suffix!() => (),
270                            match_escape_unicode_suffix!() => self.check_unicode_sequence()?,
271                            _ => {
272                                return lexer_error!(
273                                    ParserErrorDetails::InvalidEscapeSequence(
274                                        self.current_string()
275                                    ),
276                                    self.back_coords()
277                                );
278                            }
279                        },
280                        Err(err) => {
281                            return lexer_error!(err.details, err.coords.unwrap());
282                        }
283                    },
284                    match_quote!() => {
285                        return packed_token!(
286                            Token::Str(self.current_string()),
287                            self.back_coords(),
288                            self.front_coords()
289                        );
290                    }
291                    _ => (),
292                },
293                Err(err) => return lexer_error!(err.details, err.coords.unwrap()),
294            }
295        }
296    }
297
298    /// Check for a valid unicode escape sequence of the form '\uXXXX'
299    fn check_unicode_sequence(&mut self) -> ParserResult<()> {
300        let start_position = self.absolute_position();
301        for i in 1..=4 {
302            match self.advance(false) {
303                Ok(_) => {
304                    if !self.front_char().is_ascii_hexdigit() {
305                        return lexer_error!(
306                            ParserErrorDetails::InvalidUnicodeEscapeSequence(self.current_string()),
307                            start_position
308                        );
309                    }
310                }
311                Err(e) => {
312                    return lexer_error!(ParserErrorDetails::EndOfInput, self.absolute_position());
313                }
314            }
315        }
316        Ok(())
317    }
318
319    /// Match on a valid Json number representation, taking into account valid prefixes allowed
320    /// within Json but discarding anything that may be allowed by a more general representations.
321    ///
322    /// Few rules are applied here, leading to different error conditions:
323    /// - All representations must have a valid prefix
324    /// - Only a single exponent can be specified
325    /// - Only a single decimal point can be specified
326    /// - Exponents must be well-formed
327    /// - An non-exponent alphabetic found in the representation will result in an error
328    /// - Numbers can be terminated by commas, brackets and whitespace only (end of pair, end of array)
329    fn match_number(&mut self) -> ParserResult<PackedToken> {
330        let mut have_exponent = false;
331        let mut have_decimal = false;
332
333        match self.match_valid_number_prefix() {
334            Ok(integral) => {
335                have_decimal = !integral;
336                loop {
337                    match self.advance(false) {
338                        Ok(_) => match self.front_char() {
339                            match_digit!() => (),
340                            match_exponent!() => {
341                                if !have_exponent {
342                                    self.check_following_exponent()?;
343                                    have_exponent = true;
344                                } else {
345                                    return lexer_error!(
346                                        ParserErrorDetails::InvalidNumericRepresentation(
347                                            self.current_string()
348                                        ),
349                                        self.back_coords()
350                                    );
351                                }
352                            }
353                            match_period!() => {
354                                if !have_decimal {
355                                    have_decimal = true;
356                                } else {
357                                    return lexer_error!(
358                                        ParserErrorDetails::InvalidNumericRepresentation(
359                                            self.current_string()
360                                        ),
361                                        self.back_coords()
362                                    );
363                                }
364                            }
365                            match_numeric_terminator!() => {
366                                self.input.pushback();
367                                break;
368                            }
369                            ch if ch.is_ascii_whitespace() => {
370                                self.input.pushback();
371                                break;
372                            }
373                            ch if ch.is_alphabetic() => {
374                                return lexer_error!(
375                                    ParserErrorDetails::InvalidNumericRepresentation(
376                                        self.current_string()
377                                    ),
378                                    self.back_coords()
379                                );
380                            }
381                            _ => {
382                                return lexer_error!(
383                                    ParserErrorDetails::InvalidNumericRepresentation(
384                                        self.current_string()
385                                    ),
386                                    self.back_coords()
387                                );
388                            }
389                        },
390                        Err(err) => {
391                            return match err.coords {
392                                Some(coords) => lexer_error!(err.details, coords),
393                                None => lexer_error!(err.details),
394                            };
395                        }
396                    }
397                }
398            }
399            Err(err) => {
400                return match err.coords {
401                    Some(coords) => lexer_error!(err.details, coords),
402                    None => lexer_error!(err.details),
403                }
404            }
405        }
406
407        self.parse_numeric(!have_decimal)
408    }
409
410    fn check_following_exponent(&mut self) -> ParserResult<()> {
411        self.advance(false).and_then(|_| {
412            return match self.front_char() {
413                match_plus_minus!() => Ok(()),
414                _ => lexer_error!(
415                    ParserErrorDetails::InvalidNumericRepresentation(self.current_string()),
416                    self.absolute_position()
417                ),
418            };
419        })
420    }
421
422    #[cfg(not(feature = "mixed_numerics"))]
423    fn parse_numeric(
424        &mut self,
425        integral: bool,
426        start_coords: Coords,
427        end_coords: Coords,
428    ) -> ParserResult<PackedToken> {
429        packed_token!(
430            Token::Float(fast_float::parse(self.input.buffer_as_bytes()).unwrap()),
431            back_input_coords!(),
432            front_input_coords!()
433        )
434    }
435
436    #[cfg(feature = "mixed_numerics")]
437    fn parse_numeric(&mut self, integral: bool) -> ParserResult<PackedToken> {
438        if integral {
439            packed_token!(
440                Token::Integer(lexical::parse(self.input.buffer_as_byte_array()).unwrap()),
441                self.back_coords(),
442                self.front_coords()
443            )
444        } else {
445            packed_token!(
446                Token::Float(fast_float::parse(self.input.buffer_as_byte_array()).unwrap()),
447                self.back_coords(),
448                self.front_coords()
449            )
450        }
451    }
452
453    /// Check that a numeric representation is prefixed correctly.
454    ///
455    /// A few rules here:
456    /// - A leading minus must be followed by a digit
457    /// - A leading minus must be followed by at most one zero before a period
458    /// - Any number > zero can't have a leading zero in the representation
459    fn match_valid_number_prefix(&mut self) -> ParserResult<bool> {
460        let ch = self.back_char();
461        assert!(ch.is_ascii_digit() || ch == '-');
462        match ch {
463            match_minus!() => self
464                .input
465                .advance(false)
466                .and_then(|_| self.check_following_minus()),
467            match_zero!() => self
468                .input
469                .advance(false)
470                .and_then(|_| self.check_following_zero()),
471            _ => Ok(true),
472        }
473    }
474
475    fn check_following_zero(&mut self) -> ParserResult<bool> {
476        match self.front_char() {
477            match_period!() => Ok(false),
478            match_digit!() => lexer_error!(
479                ParserErrorDetails::InvalidNumericRepresentation(self.current_string()),
480                self.back_coords()
481            ),
482            match_newline!() => {
483                self.input.pushback();
484                Ok(true)
485            }
486            _ => {
487                self.input.pushback();
488                Ok(true)
489            }
490        }
491    }
492
493    fn check_following_minus(&mut self) -> ParserResult<bool> {
494        match self.front_char() {
495            match_non_zero_digit!() => Ok(true),
496            match_zero!() => self.advance(false).and_then(|_| {
497                if self.front_char() != '.' {
498                    return lexer_error!(
499                        ParserErrorDetails::InvalidNumericRepresentation(self.current_string()),
500                        self.back_coords()
501                    );
502                }
503                Ok(false)
504            }),
505            match_newline!() => {
506                self.input.pushback();
507                Ok(true)
508            }
509            _ => lexer_error!(
510                ParserErrorDetails::InvalidNumericRepresentation(self.current_string()),
511                self.back_coords()
512            ),
513        }
514    }
515
516    /// Match on a null token
517    fn match_null(&mut self) -> ParserResult<PackedToken> {
518        self.input.advance_n(3, false).and_then(|_| {
519            if self.current_chars() == NULL_PATTERN {
520                packed_token!(Token::Null, self.back_coords(), self.front_coords())
521            } else {
522                lexer_error!(
523                    ParserErrorDetails::MatchFailed(
524                        String::from_iter(NULL_PATTERN.iter()),
525                        self.current_string()
526                    ),
527                    self.back_coords()
528                )
529            }
530        })
531    }
532
533    /// Match on a true token
534    fn match_true(&mut self) -> ParserResult<PackedToken> {
535        self.advance_n(3, false).and_then(|_| {
536            if self.current_chars() == TRUE_PATTERN {
537                packed_token!(
538                    Token::Boolean(true),
539                    self.back_coords(),
540                    self.front_coords()
541                )
542            } else {
543                lexer_error!(
544                    ParserErrorDetails::MatchFailed(
545                        String::from_iter(TRUE_PATTERN.iter()),
546                        self.current_string()
547                    ),
548                    self.back_coords()
549                )
550            }
551        })
552    }
553
554    /// Match on a false token
555    fn match_false(&mut self) -> ParserResult<PackedToken> {
556        self.advance_n(4, false).and_then(|_| {
557            if self.current_chars() == FALSE_PATTERN {
558                packed_token!(
559                    Token::Boolean(false),
560                    self.back_coords(),
561                    self.front_coords()
562                )
563            } else {
564                lexer_error!(
565                    ParserErrorDetails::MatchFailed(
566                        String::from_iter(FALSE_PATTERN.iter()),
567                        self.current_string()
568                    ),
569                    self.back_coords()
570                )
571            }
572        })
573    }
574}
575
#[cfg(test)]
mod tests {
    use std::env;
    use std::fs::File;
    use std::io::{BufRead, BufReader};
    use std::time::Instant;

    use chisel_decoders::utf8::Utf8Decoder;

    use crate::coords::Span;
    use crate::lexer::lexer_core::{Lexer, PackedToken, Token};
    use crate::results::{ParserError, ParserResult};
    use crate::{lines_from_relative_file, reader_from_bytes};

    /// Each structural character should produce its own token, followed by end of input
    #[test]
    fn should_parse_basic_tokens() {
        let mut reader = reader_from_bytes!("{}[],:");
        let mut decoder = Utf8Decoder::new(&mut reader);
        let mut lexer = Lexer::new(&mut decoder);
        let (tokens, _spans): (Vec<Token>, Vec<Span>) =
            (0..7).map(|_| lexer.consume().unwrap()).unzip();
        assert_eq!(
            tokens,
            [
                Token::StartObject,
                Token::EndObject,
                Token::StartArray,
                Token::EndArray,
                Token::Comma,
                Token::Colon,
                Token::EndOfInput
            ]
        );
    }

    /// Literals should be recognised whether or not they are separated by whitespace
    #[test]
    fn should_parse_null_and_booleans() {
        let mut reader = reader_from_bytes!("null true    falsetruefalse");
        let mut decoder = Utf8Decoder::new(&mut reader);
        let mut lexer = Lexer::new(&mut decoder);
        let (tokens, _spans): (Vec<Token>, Vec<Span>) =
            (0..6).map(|_| lexer.consume().unwrap()).unzip();
        assert_eq!(
            tokens,
            [
                Token::Null,
                Token::Boolean(true),
                Token::Boolean(false),
                Token::Boolean(true),
                Token::Boolean(false),
                Token::EndOfInput
            ]
        );
    }

    /// Every non-empty line in the fixture should come back as a string token equal to the line
    #[test]
    fn should_parse_strings() {
        let lines = lines_from_relative_file!("fixtures/utf-8/strings.txt");
        for l in lines.flatten() {
            if l.is_empty() {
                continue;
            }
            let mut reader = reader_from_bytes!(l);
            let mut decoder = Utf8Decoder::new(&mut reader);
            let mut lexer = Lexer::new(&mut decoder);
            match lexer.consume().unwrap().0 {
                Token::Str(str) => assert_eq!(str, l),
                _ => panic!(),
            }
        }
    }

    /// The fourth token is invalid, and the error should point at it
    #[test]
    fn should_report_correct_error_char_position() {
        let mut reader = reader_from_bytes!("{\"abc\" : \nd}");
        let mut decoder = Utf8Decoder::new(&mut reader);
        let mut lexer = Lexer::new(&mut decoder);
        let results: Vec<_> = (0..4).map(|_| lexer.consume()).collect();
        assert!(results[3].is_err());
        let coords = results[3].clone().err().unwrap().coords.unwrap();
        assert_eq!(coords.absolute, 11);
        assert_eq!(coords.line, 2)
    }

    /// Every non-empty line in the fixture should parse to the number it represents
    #[test]
    fn should_parse_numerics() {
        let start = Instant::now();
        let lines = lines_from_relative_file!("fixtures/utf-8/numbers.txt");
        for l in lines.flatten() {
            if l.is_empty() {
                continue;
            }
            println!("Parsing {}", l);
            let mut reader = reader_from_bytes!(l);
            let mut decoder = Utf8Decoder::new(&mut reader);
            let mut lexer = Lexer::new(&mut decoder);
            let (token, _span) = lexer.consume().unwrap();
            // the expected value is the line itself, minus any commas
            let expected = match token {
                Token::Integer(_) => Token::Integer(l.replace(',', "").parse::<i64>().unwrap()),
                Token::Float(_) => Token::Float(fast_float::parse(l.replace(',', "")).unwrap()),
                _ => panic!(),
            };
            assert_eq!(token, expected);
        }
        println!("Parsed numerics in {:?}", start.elapsed());
    }

    /// Every non-empty line in the fixture should be rejected
    #[test]
    fn should_correctly_handle_invalid_numbers() {
        let lines = lines_from_relative_file!("fixtures/utf-8/invalid_numbers.txt");
        for l in lines.flatten() {
            if l.is_empty() {
                continue;
            }
            let mut reader = reader_from_bytes!(l);
            let mut decoder = Utf8Decoder::new(&mut reader);
            let mut lexer = Lexer::new(&mut decoder);
            assert!(lexer.consume().is_err());
        }
    }

    /// Every non-empty line in the fixture should produce an error before the input runs out
    #[test]
    fn should_correctly_identity_dodgy_strings() {
        let lines = lines_from_relative_file!("fixtures/utf-8/dodgy_strings.txt");
        for l in lines.flatten() {
            if l.is_empty() {
                continue;
            }
            let mut reader = reader_from_bytes!(l);
            let mut decoder = Utf8Decoder::new(&mut reader);
            let mut lexer = Lexer::new(&mut decoder);
            let error_token: Option<ParserError> = loop {
                match lexer.consume() {
                    Ok((Token::EndOfInput, _)) => break None,
                    Ok(_) => (),
                    Err(err) => {
                        println!("Dodgy string found: {} : {}", l, err.coords.unwrap());
                        break Some(err);
                    }
                }
            };
            assert!(error_token.is_some());
        }
    }

    /// A valid boolean followed by a misspelt one: the first succeeds, the second fails
    #[test]
    fn should_correctly_report_errors_for_booleans() {
        let mut reader = reader_from_bytes!("true farse");
        let mut decoder = Utf8Decoder::new(&mut reader);
        let mut lexer = Lexer::new(&mut decoder);
        let results: Vec<ParserResult<PackedToken>> = (0..2).map(|_| lexer.consume()).collect();

        // check that we've got the correct types of results
        assert!(results[0].is_ok());
        assert!(results[1].is_err());

        // check that we've located a boolean in the correct position
        if let Ok(packed) = &results[0] {
            assert_eq!(packed.1.start.column, 1)
        }

        // check that the dodgy boolean has been picked up at the correct location
        if let Err(err) = &results[1] {
            assert_eq!(err.coords.unwrap().column, 6)
        }

        println!("Parse error: {:?}", results[1]);
    }
}