chisel_lexers/json/lexer.rs

//! The chisel JSON lexer backend.
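//!
//! A hand-rolled JSON lexer which scans a character stream and produces [Token]s (packed
//! together with their [Span]s) on demand via [Lexer::consume]. A rough usage sketch, based
//! on the unit tests at the bottom of this file (the `Utf8Decoder` front end and the
//! `reader_from_bytes!` helper are taken from those tests, not required by the lexer itself):
//!
//! ```text
//! let mut reader = reader_from_bytes!("{ \"a\" : 1 }");
//! let mut decoder = Utf8Decoder::new(&mut reader);
//! let mut lexer = Lexer::new(&mut decoder);
//! loop {
//!     match lexer.consume() {
//!         Ok((Token::EndOfInput, _)) => break,
//!         Ok((token, span)) => println!("{} @ {}", token, span.start),
//!         Err(err) => panic!("lexing failed: {}", err),
//!     }
//! }
//! ```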
#![allow(unused_assignments)]
#![allow(unused_variables)]
#![allow(unreachable_code)]
#![allow(dead_code)]
#![allow(unused_imports)]

use std::fmt::{Display, Formatter};

use crate::json::numerics::LazyNumeric;
use chisel_common::char::coords::Coords;
use chisel_common::char::span::Span;

use crate::json::tokens::{PackedToken, Token};
use crate::scanner::{CharWithCoords, Scanner};

/// JSON lexer backend result type
pub type LexerResult<T> = Result<T, LexerError>;

/// A global enumeration of error codes
#[derive(Debug, Clone, PartialEq)]
pub enum LexerErrorDetails {
    /// An invalid file has been specified. It might not exist, or might not be accessible
    InvalidFile,
    /// Nothing to parse: zero-length input was supplied
    ZeroLengthInput,
    /// End of input has been reached. This is used as a stopping condition at various points.
    EndOfInput,
    /// If pulling bytes from an underlying stream (or [BufRead]) of some description, and an
    /// error occurs, this will be returned.
    StreamFailure,
    /// Dodgy UTF-8 has been found in the input.
    NonUtf8InputDetected,
    /// Edge case error condition. This means that something has gone horribly wrong with the
    /// parse.
    UnexpectedToken(Token),
    /// A KV pair was expected but not found.
    PairExpected,
    /// The supplied JSON doesn't have an object or array as its root.
    InvalidRootObject,
    /// The parse of an object has failed.
    InvalidObject,
    /// The parse of an array has failed.
    InvalidArray,
    /// An invalid character has been detected within the input.
    InvalidCharacter(char),
    /// Whilst looking for a literal string token (null, true, false), a match couldn't be found.
    MatchFailed(String, String),
    /// A number has been found with an incorrect string representation.
    InvalidNumericRepresentation(String),
    /// An invalid escape sequence has been found within the input.
    InvalidEscapeSequence(String),
    /// An invalid unicode escape sequence (\uXXXX) has been found within the input.
    InvalidUnicodeEscapeSequence(String),
}

impl Display for LexerErrorDetails {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self {
            LexerErrorDetails::InvalidFile => write!(f, "invalid file specified"),
            LexerErrorDetails::ZeroLengthInput => write!(f, "zero length input"),
            LexerErrorDetails::EndOfInput => write!(f, "end of input reached"),
            LexerErrorDetails::StreamFailure => write!(f, "failure in the underlying stream"),
            LexerErrorDetails::NonUtf8InputDetected => write!(f, "non-UTF8 input"),
            LexerErrorDetails::UnexpectedToken(token) => {
                write!(f, "unexpected token found: {}", token)
            }
            LexerErrorDetails::PairExpected => {
                write!(f, "pair expected, something else was found")
            }
            LexerErrorDetails::InvalidRootObject => write!(f, "invalid JSON"),
            LexerErrorDetails::InvalidObject => write!(f, "invalid object"),
            LexerErrorDetails::InvalidArray => write!(f, "invalid array"),
            LexerErrorDetails::InvalidCharacter(ch) => write!(f, "invalid character: \'{}\'", ch),
            LexerErrorDetails::MatchFailed(first, second) => write!(
                f,
                "a match failed. Looking for \"{}\", found \"{}\"",
                first, second
            ),
            LexerErrorDetails::InvalidNumericRepresentation(repr) => {
                write!(f, "invalid number representation: \"{}\"", repr)
            }
            LexerErrorDetails::InvalidEscapeSequence(seq) => {
                write!(f, "invalid escape sequence: \"{}\"", seq)
            }
            LexerErrorDetails::InvalidUnicodeEscapeSequence(seq) => {
                write!(f, "invalid unicode escape sequence: \"{}\"", seq)
            }
        }
    }
}

/// The general error structure
#[derive(Debug, Clone)]
pub struct LexerError {
    /// The global error code for the error
    pub details: LexerErrorDetails,
    /// The input [Coords] at which the error occurred, if known
    pub coords: Option<Coords>,
}

impl Display for LexerError {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        if self.coords.is_some() {
            write!(
                f,
                "details: {}, coords: {}",
                self.details,
                self.coords.unwrap()
            )
        } else {
            write!(f, "details: {}", self.details)
        }
    }
}

/// Wrap a [LexerError] in a top level [Err]
macro_rules! wrapped_lexer_error {
    ($details: expr, $coords: expr) => {
        Err(LexerError {
            details: $details,
            coords: Some($coords),
        })
    };
    ($details: expr) => {
        Err(LexerError {
            details: $details,
            coords: None,
        })
    };
}

/// Create a [LexerError]
macro_rules! lexer_error {
    ($details: expr, $coords: expr) => {
        LexerError {
            details: $details,
            coords: Some($coords),
        }
    };
    ($details: expr) => {
        LexerError {
            details: $details,
            coords: None,
        }
    };
}

/// Pattern to match for null
const NULL_ASCII: [u8; 4] = [0x6e, 0x75, 0x6c, 0x6c];
/// Pattern to match for true
const TRUE_ASCII: [u8; 4] = [0x74, 0x72, 0x75, 0x65];
/// Pattern to match for false
const FALSE_ASCII: [u8; 5] = [0x66, 0x61, 0x6c, 0x73, 0x65];

/// Package a [Token] and its start/end [Coords] into a [LexerResult] holding a [PackedToken]
macro_rules! packed_token {
    ($t:expr, $s:expr, $e:expr) => {
        Ok(($t, Span { start: $s, end: $e }))
    };
    ($t:expr, $s:expr) => {
        Ok(($t, Span { start: $s, end: $s }))
    };
}

/// Pattern matching macro
macro_rules! match_zero {
    () => {
        '0'
    };
}

/// Pattern matching macro
macro_rules! match_minus {
    () => {
        '-'
    };
}

/// Pattern matching macro
macro_rules! match_plus_minus {
    () => {
        '+' | '-'
    };
}

/// Pattern matching macro
macro_rules! match_digit {
    () => {
        '0'..='9'
    };
}

/// Pattern matching macro
macro_rules! match_non_zero_digit {
    () => {
        '1'..='9'
    };
}

/// Pattern matching macro
macro_rules! match_exponent {
    () => {
        'e' | 'E'
    };
}

/// Pattern matching macro
macro_rules! match_period {
    () => {
        '.'
    };
}

/// Pattern matching macro
macro_rules! match_numeric_terminator {
    () => {
        ']' | '}' | ','
    };
}

/// Pattern matching macro
macro_rules! match_escape {
    () => {
        '\\'
    };
}

/// Pattern matching macro
macro_rules! match_escape_non_unicode_suffix {
    () => {
        'n' | 't' | 'r' | '\\' | '/' | 'b' | 'f' | '\"'
    };
}

/// Pattern matching macro
macro_rules! match_escape_unicode_suffix {
    () => {
        'u'
    };
}

/// Pattern matching macro
macro_rules! match_quote {
    () => {
        '\"'
    };
}

/// Pattern matching macro
macro_rules! match_newline {
    () => {
        '\n'
    };
}

/// Given a source and target to compare, adjust a starting [Coords] so that it points to
/// the exact location where they fail to match
#[inline]
fn adjusted_error_coords(start_coords: &Coords, source: &[u8], target: &[u8]) -> Coords {
    let mut err_coords = Coords::from_coords(start_coords);
    // clamp the walk to the shorter of the two slices so we can't index past either end
    for i in 0..source.len().min(target.len()) {
        if !source[i].is_ascii_whitespace() {
            if source[i] != target[i] {
                break;
            } else {
                err_coords.increment();
            }
        } else {
            err_coords.increment();
        }
    }
    err_coords
}
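
// Worked example (comment only, not from the original sources): matching the buffered
// bytes b"farse" against FALSE_ASCII walks 'f' and 'a' (two increments) and then breaks
// on 'r' vs 'l', so the reported coordinates land on the 'r'. This lines up with the
// boolean error-position test at the bottom of this file, which expects column 8 for
// the input "true farse".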

/// The JSON lexer, which pulls [Token]s on demand from an underlying [Scanner]
pub struct Lexer<'a> {
    /// The scanner over the input, which tracks the current coordinate state
    input: Scanner<'a>,
}
impl<'a> Lexer<'a> {
    /// Create a new lexer instance which will lex the supplied character iterator
    pub fn new(chars: &'a mut impl Iterator<Item = char>) -> Self {
        Lexer {
            input: Scanner::new(chars),
        }
    }

    /// Get the front of the input
    fn front(&self) -> Option<CharWithCoords> {
        self.input.front()
    }

    /// Get the back of the input
    fn back(&self) -> Option<CharWithCoords> {
        self.input.back()
    }

    /// Grab the front character
    #[inline]
    fn front_char(&self) -> char {
        self.input.front().unwrap().ch
    }

    /// Grab the back character
    #[inline]
    fn back_char(&self) -> char {
        self.input.back().unwrap().ch
    }

    /// Grab the front input coordinates
    #[inline]
    fn front_coords(&self) -> Coords {
        self.input.front().unwrap().coords
    }

    /// Grab the back input coordinates
    #[inline]
    fn back_coords(&self) -> Coords {
        self.input.back().unwrap().coords
    }

    /// Grab the current absolute input coordinates
    #[inline]
    fn absolute_position(&self) -> Coords {
        self.input.position()
    }

    /// Advance the input by one
    #[inline]
    fn advance(&mut self, skip_whitespace: bool) -> LexerResult<()> {
        self.input
            .advance(skip_whitespace)
            .map_err(|e| lexer_error!(LexerErrorDetails::EndOfInput))
    }

    /// Advance the input by n
    fn advance_n(&mut self, n: usize, skip_whitespace: bool) -> LexerResult<()> {
        self.input
            .advance_n(n, skip_whitespace)
            .map_err(|e| lexer_error!(LexerErrorDetails::EndOfInput))
    }

    /// Grab the current input string
    #[inline]
    fn current_string(&mut self) -> String {
        self.input.buffer_as_string_with_span().str
    }

    /// Grab the current input character array
    #[inline]
    fn current_chars(&mut self) -> Vec<char> {
        self.input.buffer_as_char_array()
    }

    /// Grab the current input byte array
    #[inline]
    fn current_bytes(&mut self) -> Vec<u8> {
        self.input.buffer_as_byte_array()
    }

    /// Consume the next [Token] from the input, clearing the internal scanner buffer first
    pub fn consume(&mut self) -> LexerResult<PackedToken> {
        self.input.clear();
        match self.advance(true) {
            Ok(_) => match self.input.front() {
                Some(CharWithCoords { ch: '{', coords }) => {
                    packed_token!(Token::StartObject, coords)
                }
                Some(CharWithCoords { ch: '}', coords }) => packed_token!(Token::EndObject, coords),
                Some(CharWithCoords { ch: '[', coords }) => {
                    packed_token!(Token::StartArray, coords)
                }
                Some(CharWithCoords { ch: ']', coords }) => packed_token!(Token::EndArray, coords),
                Some(CharWithCoords { ch: ':', coords }) => packed_token!(Token::Colon, coords),
                Some(CharWithCoords { ch: ',', coords }) => packed_token!(Token::Comma, coords),
                Some(CharWithCoords { ch: '\"', coords }) => self.match_string(),
                Some(CharWithCoords { ch: 'n', coords }) => self.match_null(),
                Some(CharWithCoords { ch: 't', coords }) => self.match_true(),
                Some(CharWithCoords { ch: 'f', coords }) => self.match_false(),
                Some(CharWithCoords { ch: '-', coords }) => self.match_number(),
                Some(CharWithCoords { ch: d, coords }) if d.is_ascii_digit() => self.match_number(),
                Some(CharWithCoords { ch, coords }) => wrapped_lexer_error!(
                    LexerErrorDetails::InvalidCharacter(ch.clone()),
                    coords.clone()
                ),
                None => {
                    wrapped_lexer_error!(LexerErrorDetails::EndOfInput, self.absolute_position())
                }
            },
            Err(err) => match err.details {
                LexerErrorDetails::EndOfInput => {
                    packed_token!(Token::EndOfInput, self.input.position())
                }
                _ => match err.coords {
                    Some(coords) => wrapped_lexer_error!(err.details, coords),
                    None => wrapped_lexer_error!(err.details, self.absolute_position()),
                },
            },
        }
    }
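
    // Illustrative walk-through (comment only, not generated output): for the input
    // `{ "a" : 1 }` successive calls to `consume` should yield StartObject, a Str holding
    // the raw quoted text, Colon, a numeric token (Integer without the `lazy-numerics`
    // feature, LazyNumeric with it), EndObject and finally EndOfInput, each packed with
    // the Span it was read from.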

    /// Match on a valid Json string.
    #[inline]
    fn match_string(&mut self) -> LexerResult<PackedToken> {
        loop {
            match self.advance(false) {
                Ok(_) => match self.front_char() {
                    match_escape!() => match self.input.advance(false) {
                        Ok(_) => match self.front_char() {
                            match_escape_non_unicode_suffix!() => (),
                            match_escape_unicode_suffix!() => self.check_unicode_sequence()?,
                            _ => {
                                return wrapped_lexer_error!(
                                    LexerErrorDetails::InvalidEscapeSequence(self.current_string()),
                                    self.back_coords()
                                );
                            }
                        },
                        Err(err) => {
                            return wrapped_lexer_error!(
                                LexerErrorDetails::EndOfInput,
                                err.coords.unwrap()
                            );
                        }
                    },
                    match_quote!() => {
                        return packed_token!(
                            Token::Str(self.current_string()),
                            self.back_coords(),
                            self.front_coords()
                        );
                    }
                    _ => (),
                },
                Err(err) => {
                    return match err.coords {
                        Some(_) => {
                            wrapped_lexer_error!(err.details, err.coords.unwrap())
                        }
                        None => wrapped_lexer_error!(err.details, self.absolute_position()),
                    }
                }
            }
        }
    }

    /// Check for a valid unicode escape sequence of the form '\uXXXX'
    #[inline]
    fn check_unicode_sequence(&mut self) -> LexerResult<()> {
        let start_position = self.absolute_position();
        for i in 1..=4 {
            match self.advance(false) {
                Ok(_) => {
                    if !self.front_char().is_ascii_hexdigit() {
                        return wrapped_lexer_error!(
                            LexerErrorDetails::InvalidUnicodeEscapeSequence(self.current_string()),
                            start_position
                        );
                    }
                }
                Err(e) => {
                    return wrapped_lexer_error!(
                        LexerErrorDetails::EndOfInput,
                        self.absolute_position()
                    );
                }
            }
        }
        Ok(())
    }

    /// Match on a valid Json number representation, taking into account the prefixes allowed
    /// within Json but discarding anything that might be allowed by a more general representation.
    ///
    /// A few rules are applied here, leading to different error conditions:
    /// - All representations must have a valid prefix
    /// - Only a single exponent can be specified
    /// - Only a single decimal point can be specified
    /// - Exponents must be well-formed
    /// - A non-exponent alphabetic character found in the representation will result in an error
    /// - Numbers can be terminated by commas, brackets and whitespace only (end of pair, end of array)
    #[inline]
    fn match_number(&mut self) -> LexerResult<PackedToken> {
        let mut have_exponent = false;
        let mut have_decimal = false;

        match self.match_valid_number_prefix() {
            Ok(integral) => {
                have_decimal = !integral;
                loop {
                    match self.advance(false) {
                        Ok(_) => match self.front_char() {
                            match_digit!() => (),
                            match_exponent!() => {
                                if !have_exponent {
                                    self.check_following_exponent()?;
                                    have_exponent = true;
                                } else {
                                    return wrapped_lexer_error!(
                                        LexerErrorDetails::InvalidNumericRepresentation(
                                            self.current_string()
                                        ),
                                        self.back_coords()
                                    );
                                }
                            }
                            match_period!() => {
                                if !have_decimal {
                                    have_decimal = true;
                                } else {
                                    return wrapped_lexer_error!(
                                        LexerErrorDetails::InvalidNumericRepresentation(
                                            self.current_string()
                                        ),
                                        self.back_coords()
                                    );
                                }
                            }
                            match_numeric_terminator!() => {
                                self.input.pushback();
                                break;
                            }
                            ch if ch.is_ascii_whitespace() => {
                                self.input.pushback();
                                break;
                            }
                            ch if ch.is_alphabetic() => {
                                return wrapped_lexer_error!(
                                    LexerErrorDetails::InvalidNumericRepresentation(
                                        self.current_string()
                                    ),
                                    self.back_coords()
                                );
                            }
                            _ => {
                                return wrapped_lexer_error!(
                                    LexerErrorDetails::InvalidNumericRepresentation(
                                        self.current_string()
                                    ),
                                    self.back_coords()
                                );
                            }
                        },
                        Err(err) => {
                            return match err.coords {
                                Some(coords) => wrapped_lexer_error!(err.details, coords),
                                None => wrapped_lexer_error!(err.details),
                            };
                        }
                    }
                }
            }
            Err(err) => {
                return match err.coords {
                    Some(coords) => wrapped_lexer_error!(err.details, coords),
                    None => wrapped_lexer_error!(err.details),
                };
            }
        }

        self.parse_numeric(!have_decimal)
    }
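
    // A few illustrative cases (comment only) of the rules above, assuming the number is
    // properly terminated within a document: representations such as "0,", "-0.5,", "123,"
    // and "-2.5e+2," lex cleanly, whereas "01", "-01", "1.2.3" and "1e+4e+4" are rejected
    // with InvalidNumericRepresentation errors.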

    /// Check the character immediately following an exponent marker
    #[inline]
    fn check_following_exponent(&mut self) -> LexerResult<()> {
        self.advance(false).and_then(|_| {
            return match self.front_char() {
                match_plus_minus!() => Ok(()),
                _ => wrapped_lexer_error!(
                    LexerErrorDetails::InvalidNumericRepresentation(self.current_string()),
                    self.absolute_position()
                ),
            };
        })
    }

    /// Parse the accumulated numeric buffer. We detect whether we have an integral or a
    /// floating point value, and parse differently (and emit different tokens) for each
    #[cfg(not(feature = "lazy-numerics"))]
    #[inline]
    fn parse_numeric(&mut self, integral: bool) -> LexerResult<PackedToken> {
        if integral {
            packed_token!(
                Token::Integer(lexical::parse(self.input.buffer_as_byte_array()).unwrap()),
                self.back_coords(),
                self.front_coords()
            )
        } else {
            packed_token!(
                Token::Float(fast_float::parse(self.input.buffer_as_byte_array()).unwrap()),
                self.back_coords(),
                self.front_coords()
            )
        }
    }

    /// Parse the accumulated numeric buffer into a [LazyNumeric], deferring the conversion
    #[cfg(feature = "lazy-numerics")]
    #[inline]
    fn parse_numeric(&mut self, integral: bool) -> LexerResult<PackedToken> {
        packed_token!(
            Token::LazyNumeric(LazyNumeric::new(
                self.input.buffer_as_byte_array().as_slice()
            )),
            self.back_coords(),
            self.front_coords()
        )
    }
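
    // Note (comment only): without the `lazy-numerics` feature, "42," lexes to
    // Token::Integer(42) and "42.5," to Token::Float(42.5); with the feature enabled both
    // come back as Token::LazyNumeric, leaving the actual numeric conversion to the consumer.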

    /// Check that a numeric representation is prefixed correctly.
    ///
    /// A few rules here:
    /// - A leading minus must be followed by a digit
    /// - A leading minus must be followed by at most one zero before a period
    /// - Any number > zero can't have a leading zero in the representation
    #[inline]
    fn match_valid_number_prefix(&mut self) -> LexerResult<bool> {
        let ch = self.back_char();
        assert!(ch.is_ascii_digit() || ch == '-');
        match ch {
            match_minus!() => self
                .input
                .advance(false)
                .map_err(|e| lexer_error!(LexerErrorDetails::EndOfInput))
                .and_then(|_| self.check_following_minus()),
            match_zero!() => self
                .input
                .advance(false)
                .map_err(|e| lexer_error!(LexerErrorDetails::EndOfInput))
                .and_then(|_| self.check_following_zero()),
            _ => Ok(true),
        }
    }

    /// Check for valid characters following a zero
    #[inline]
    fn check_following_zero(&mut self) -> LexerResult<bool> {
        match self.front_char() {
            match_period!() => Ok(false),
            match_digit!() => wrapped_lexer_error!(
                LexerErrorDetails::InvalidNumericRepresentation(self.current_string()),
                self.back_coords()
            ),
            match_newline!() => {
                self.input.pushback();
                Ok(true)
            }
            _ => {
                self.input.pushback();
                Ok(true)
            }
        }
    }

    /// Check for valid characters following a minus character
    #[inline]
    fn check_following_minus(&mut self) -> LexerResult<bool> {
        match self.front_char() {
            match_non_zero_digit!() => Ok(true),
            match_zero!() => self.advance(false).and_then(|_| {
                if self.front_char() != '.' {
                    return wrapped_lexer_error!(
                        LexerErrorDetails::InvalidNumericRepresentation(self.current_string()),
                        self.back_coords()
                    );
                }
                Ok(false)
            }),
            match_newline!() => {
                self.input.pushback();
                Ok(true)
            }
            _ => wrapped_lexer_error!(
                LexerErrorDetails::InvalidNumericRepresentation(self.current_string()),
                self.back_coords()
            ),
        }
    }

    /// Match on a null token
    #[inline]
    fn match_null(&mut self) -> LexerResult<PackedToken> {
        self.input
            .advance_n(3, false)
            .map_err(|e| lexer_error!(LexerErrorDetails::EndOfInput, self.absolute_position()))
            .and_then(|_| {
                if self.current_bytes() == NULL_ASCII {
                    packed_token!(Token::Null, self.back_coords(), self.front_coords())
                } else {
                    wrapped_lexer_error!(
                        LexerErrorDetails::MatchFailed(String::from("null"), self.current_string()),
                        adjusted_error_coords(
                            &self.back_coords(),
                            &self.current_bytes().as_slice(),
                            &NULL_ASCII
                        )
                    )
                }
            })
    }

    /// Match on a true token
    #[inline]
    fn match_true(&mut self) -> LexerResult<PackedToken> {
        self.advance_n(3, false)
            .map_err(|e| lexer_error!(LexerErrorDetails::EndOfInput, self.absolute_position()))
            .and_then(|_| {
                if self.current_bytes() == TRUE_ASCII {
                    packed_token!(
                        Token::Boolean(true),
                        self.back_coords(),
                        self.front_coords()
                    )
                } else {
                    wrapped_lexer_error!(
                        LexerErrorDetails::MatchFailed(String::from("true"), self.current_string()),
                        adjusted_error_coords(
                            &self.back_coords(),
                            &self.current_bytes().as_slice(),
                            &TRUE_ASCII
                        )
                    )
                }
            })
    }

    /// Match on a false token
    #[inline]
    fn match_false(&mut self) -> LexerResult<PackedToken> {
        self.advance_n(4, false)
            .map_err(|e| lexer_error!(LexerErrorDetails::EndOfInput, self.absolute_position()))
            .and_then(|_| {
                if self.current_bytes() == FALSE_ASCII {
                    packed_token!(
                        Token::Boolean(false),
                        self.back_coords(),
                        self.front_coords()
                    )
                } else {
                    wrapped_lexer_error!(
                        LexerErrorDetails::MatchFailed(
                            String::from("false"),
                            self.current_string()
                        ),
                        adjusted_error_coords(
                            &self.back_coords(),
                            &self.current_bytes().as_slice(),
                            &FALSE_ASCII
                        )
                    )
                }
            })
    }
}

#[cfg(test)]
mod tests {
    use std::env;
    use std::fs::File;
    use std::io::{BufRead, BufReader};
    use std::time::Instant;

    use chisel_common::char::span::Span;
    use chisel_common::{lines_from_relative_file, reader_from_bytes};
    use chisel_decoders::utf8::Utf8Decoder;

    use crate::json::lexer::{Lexer, LexerError, LexerResult};
    use crate::json::tokens::{PackedToken, Token};

    #[test]
    fn should_report_position_of_eoi() {
        let input = String::from("\"this is a test");
        let mut reader = reader_from_bytes!(input);
        let mut decoder = Utf8Decoder::new(&mut reader);
        let mut lexer = Lexer::new(&mut decoder);
        let result = lexer.consume();
        match result {
            Err(err) => {
                assert!(err.coords.is_some());
                assert_eq!(err.coords.unwrap().column, input.len())
            }
            _ => assert!(false),
        }
    }

    #[test]
    fn should_parse_basic_tokens() {
        let mut reader = reader_from_bytes!("{}[],:");
        let mut decoder = Utf8Decoder::new(&mut reader);
        let mut lexer = Lexer::new(&mut decoder);
        let mut tokens: Vec<Token> = vec![];
        let mut spans: Vec<Span> = vec![];
        for _ in 1..=7 {
            let token = lexer.consume().unwrap();
            tokens.push(token.0);
            spans.push(token.1);
        }
        assert_eq!(
            tokens,
            [
                Token::StartObject,
                Token::EndObject,
                Token::StartArray,
                Token::EndArray,
                Token::Comma,
                Token::Colon,
                Token::EndOfInput
            ]
        );
    }

    #[test]
    fn should_parse_null_and_booleans() {
        let mut reader = reader_from_bytes!("null true    falsetruefalse");
        let mut decoder = Utf8Decoder::new(&mut reader);
        let mut lexer = Lexer::new(&mut decoder);
        let mut tokens: Vec<Token> = vec![];
        let mut spans: Vec<Span> = vec![];
        for _ in 1..=6 {
            let token = lexer.consume().unwrap();
            tokens.push(token.0);
            spans.push(token.1);
        }
        assert_eq!(
            tokens,
            [
                Token::Null,
                Token::Boolean(true),
                Token::Boolean(false),
                Token::Boolean(true),
                Token::Boolean(false),
                Token::EndOfInput
            ]
        );
    }

    #[test]
    fn should_parse_strings() {
        let lines = lines_from_relative_file!("fixtures/utf-8/strings.txt");
        for l in lines.flatten() {
            if !l.is_empty() {
                let mut reader = reader_from_bytes!(l);
                let mut decoder = Utf8Decoder::new(&mut reader);
                let mut lexer = Lexer::new(&mut decoder);
                let token = lexer.consume().unwrap();
                match token.0 {
                    Token::Str(str) => {
                        assert_eq!(str, l)
                    }
                    _ => panic!(),
                }
            }
        }
    }
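
    // Not part of the original suite: a minimal sketch exercising the '\uXXXX' escape
    // handling in match_string, using the same decoder setup as the tests above.
    #[test]
    fn should_accept_unicode_escape_sequences() {
        let mut reader = reader_from_bytes!("\"\\u20ac \\n \\t\"");
        let mut decoder = Utf8Decoder::new(&mut reader);
        let mut lexer = Lexer::new(&mut decoder);
        match lexer.consume().unwrap().0 {
            // the whole escaped string should come back as a single Str token
            Token::Str(_) => (),
            _ => panic!(),
        }
    }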

    #[test]
    fn should_report_correct_error_char_position() {
        let mut reader = reader_from_bytes!("{\"abc\" : \nd}");
        let mut decoder = Utf8Decoder::new(&mut reader);
        let mut lexer = Lexer::new(&mut decoder);
        let mut results = vec![];
        for _ in 0..4 {
            results.push(lexer.consume())
        }
        assert!(&results[3].is_err());
        let error = results[3].clone();
        let coords = results[3].clone().err().unwrap().coords.unwrap();
        assert_eq!(coords.absolute, 11);
        assert_eq!(coords.line, 2)
    }

    #[test]
    fn should_parse_numerics() {
        let start = Instant::now();
        let lines = lines_from_relative_file!("fixtures/utf-8/numbers.txt");
        for l in lines.flatten() {
            if !l.is_empty() {
                println!("Parsing {}", l);
                let mut reader = reader_from_bytes!(l);
                let mut decoder = Utf8Decoder::new(&mut reader);
                let mut lexer = Lexer::new(&mut decoder);
                let token = lexer.consume().unwrap();
                match token.0 {
                    Token::Integer(_) => {
                        assert_eq!(
                            token.0,
                            Token::Integer(l.replace(',', "").parse::<i64>().unwrap())
                        );
                    }
                    Token::Float(_) => {
                        assert_eq!(
                            token.0,
                            Token::Float(fast_float::parse(l.replace(',', "")).unwrap())
                        );
                    }
                    Token::LazyNumeric(lazy) => {
                        let value: f64 = lazy.into();
                        assert_eq!(
                            Token::Float(value),
                            Token::Float(fast_float::parse(l.replace(',', "")).unwrap())
                        );
                    }
                    _ => panic!(),
                }
            }
        }
        println!("Parsed numerics in {:?}", start.elapsed());
    }

    #[test]
    fn should_correctly_handle_invalid_numbers() {
        let lines = lines_from_relative_file!("fixtures/utf-8/invalid_numbers.txt");
        for l in lines.flatten() {
            if !l.is_empty() {
                let mut reader = reader_from_bytes!(l);
                let mut decoder = Utf8Decoder::new(&mut reader);
                let mut lexer = Lexer::new(&mut decoder);
                let token = lexer.consume();
                assert!(token.is_err());
            }
        }
    }
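
    // Not part of the original suite: a sketch of the leading-zero and leading-minus rules
    // documented on match_valid_number_prefix, using a couple of hypothetical inline inputs.
    #[test]
    fn should_reject_badly_prefixed_numbers() {
        for input in vec!["01", "-01", "-x"] {
            let mut reader = reader_from_bytes!(input);
            let mut decoder = Utf8Decoder::new(&mut reader);
            let mut lexer = Lexer::new(&mut decoder);
            // each of these should be rejected by the numeric prefix checks
            assert!(lexer.consume().is_err());
        }
    }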

    #[test]
    fn should_correctly_identify_dodgy_strings() {
        let lines = lines_from_relative_file!("fixtures/utf-8/dodgy_strings.txt");
        for l in lines.flatten() {
            if !l.is_empty() {
                let mut reader = reader_from_bytes!(l);
                let mut decoder = Utf8Decoder::new(&mut reader);
                let mut lexer = Lexer::new(&mut decoder);
                let mut error_token: Option<LexerError> = None;
                loop {
                    let token = lexer.consume();
                    match token {
                        Ok(packed) => {
                            if packed.0 == Token::EndOfInput {
                                break;
                            }
                        }
                        Err(err) => {
                            error_token = Some(err.clone());
                            println!("Dodgy string found: {} : {}", l, err.coords.unwrap());
                            break;
                        }
                    }
                }
                assert!(error_token.is_some());
            }
        }
    }

    #[test]
    fn should_correctly_report_errors_for_booleans() {
        let mut reader = reader_from_bytes!("true farse");
        let mut decoder = Utf8Decoder::new(&mut reader);
        let mut lexer = Lexer::new(&mut decoder);
        let mut results: Vec<LexerResult<PackedToken>> = vec![];
        for _ in 1..=2 {
            results.push(lexer.consume());
        }

        // check that we've got the correct types of results
        assert!(results[0].is_ok());
        assert!(results[1].is_err());

        // check that we've located a boolean in the correct position
        if results[0].is_ok() {
            match &results[0] {
                Ok(packed) => {
                    assert_eq!((*packed).1.start.column, 1)
                }
                Err(_) => {}
            }
        }

        // check that the dodgy boolean has been picked up at the correct location
        if results[1].is_err() {
            match &results[1] {
                Ok(_) => {}
                Err(err) => {
                    assert_eq!(err.coords.unwrap().column, 8)
                }
            }
        }

        println!("Parse error: {:?}", results[1]);
    }
}
987}