chisel_json/parsers/
sax.rs

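//! The SAX parser: an event-driven JSON parser which reports each structural element and
//! value it encounters to a caller-supplied callback instead of building a document tree.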
2use std::borrow::Cow;
3use std::fs::File;
4use std::io::{BufRead, BufReader};
5use std::path::Path;
6
7use crate::coords::Coords;
8use crate::lexer::decoders::{DecoderSelector, Encoding};
9use crate::lexer::lexer_core::{Lexer, Token};
10use crate::parsers::sax_events::{Event, Match};
11use crate::pointers::pointer::JsonPointer;
12use crate::results::{ParserError, ParserErrorDetails, ParserErrorSource, ParserResult};
13use crate::sax_parser_error;
14
/// Build an [Event] and hand a reference to it to a callback.
///
/// * `emit_event!(cb, matched, span, path)` sets `pointer` to `Some(&path)`
/// * `emit_event!(cb, matched, span)` sets `pointer` to `None`
///
/// The macro expands to the callback invocation itself, so it evaluates to whatever the
/// callback returns.
macro_rules! emit_event {
    // Internal arm shared by both public forms. It must come first so that the leading `@`
    // is never handed to an `expr` matcher.
    (@dispatch $callback:expr, $matched:expr, $span:expr, $pointer:expr) => {
        $callback(&Event {
            matched: $matched,
            span: $span,
            pointer: $pointer,
        })
    };
    ($callback:expr, $matched:expr, $span:expr, $path:expr) => {
        emit_event!(@dispatch $callback, $matched, $span, Some(&$path))
    };
    ($callback:expr, $matched:expr, $span:expr) => {
        emit_event!(@dispatch $callback, $matched, $span, None)
    };
}
31
32/// Main JSON parser struct
33pub struct Parser {
34    decoders: DecoderSelector,
35    encoding: Encoding,
36}
37
38impl Default for Parser {
39    /// The default encoding is Utf-8
40    fn default() -> Self {
41        Self {
42            decoders: Default::default(),
43            encoding: Default::default(),
44        }
45    }
46}
47
48impl Parser {
49    /// Create a new instance of the parser using a specific [Encoding]
50    pub fn with_encoding(encoding: Encoding) -> Self {
51        Self {
52            decoders: Default::default(),
53            encoding,
54        }
55    }
56
57    pub fn parse_file<PathLike: AsRef<Path>, Callback>(
58        &self,
59        path: PathLike,
60        cb: &mut Callback,
61    ) -> ParserResult<()>
62    where
63        Callback: FnMut(&Event) -> ParserResult<()>,
64    {
65        match File::open(&path) {
66            Ok(f) => {
67                let mut reader = BufReader::new(f);
68                let mut chars = self.decoders.new_decoder(&mut reader, self.encoding);
69                self.parse(&mut chars, cb)
70            }
71            Err(_) => {
72                sax_parser_error!(ParserErrorDetails::InvalidFile)
73            }
74        }
75    }
76
77    pub fn parse_bytes<Callback>(&self, bytes: &[u8], cb: &mut Callback) -> ParserResult<()>
78    where
79        Callback: FnMut(&Event) -> ParserResult<()>,
80    {
81        if bytes.is_empty() {
82            return sax_parser_error!(ParserErrorDetails::ZeroLengthInput, Coords::default());
83        }
84        let mut reader = BufReader::new(bytes);
85        let mut chars = self.decoders.default_decoder(&mut reader);
86        self.parse(&mut chars, cb)
87    }
88
89    pub fn parse_str<Callback>(&self, str: &str, cb: &mut Callback) -> ParserResult<()>
90    where
91        Callback: FnMut(&Event) -> ParserResult<()>,
92    {
93        if str.is_empty() {
94            return sax_parser_error!(ParserErrorDetails::ZeroLengthInput, Coords::default());
95        }
96        let mut reader = BufReader::new(str.as_bytes());
97        let mut chars = self.decoders.default_decoder(&mut reader);
98        self.parse(&mut chars, cb)
99    }
100
101    /// Parse the contents extracted from an instance of [BufRead]
102    pub fn parse_buffer<Callback>(
103        &self,
104        buffer: &mut impl BufRead,
105        cb: &mut Callback,
106    ) -> ParserResult<()>
107    where
108        Callback: FnMut(&Event) -> ParserResult<()>,
109    {
110        let mut chars = self.decoders.default_decoder(buffer);
111        self.parse(&mut chars, cb)
112    }
113
114    pub fn parse<Callback>(
115        &self,
116        chars: &mut impl Iterator<Item = char>,
117        cb: &mut Callback,
118    ) -> ParserResult<()>
119    where
120        Callback: FnMut(&Event) -> ParserResult<()>,
121    {
122        let mut pointer = JsonPointer::root();
123        let mut lexer = Lexer::new(chars);
124        match lexer.consume()? {
125            (Token::StartObject, span) => {
126                emit_event!(cb, Match::StartOfInput, span)?;
127                emit_event!(cb, Match::StartObject, span, pointer)?;
128                self.parse_object(&mut lexer, &mut pointer, cb)
129            }
130            (Token::StartArray, span) => {
131                emit_event!(cb, Match::StartOfInput, span, pointer)?;
132                emit_event!(cb, Match::StartArray, span, pointer)?;
133                self.parse_array(&mut lexer, &mut pointer, cb)
134            }
135            (_, span) => {
136                sax_parser_error!(ParserErrorDetails::InvalidRootObject, span.start)
137            }
138        }
139    }
140
141    fn parse_value<Callback>(
142        &self,
143        lexer: &mut Lexer,
144        pointer: &mut JsonPointer,
145        cb: &mut Callback,
146    ) -> ParserResult<()>
147    where
148        Callback: FnMut(&Event) -> ParserResult<()>,
149    {
150        match lexer.consume()? {
151            (Token::StartObject, span) => {
152                emit_event!(cb, Match::StartObject, span, pointer)?;
153                self.parse_object(lexer, pointer, cb)
154            }
155            (Token::StartArray, span) => {
156                emit_event!(cb, Match::StartArray, span, pointer)?;
157                self.parse_array(lexer, pointer, cb)
158            }
159            (Token::Str(str), span) => {
160                emit_event!(cb, Match::String(Cow::Borrowed(&str)), span, pointer)
161            }
162            (Token::Float(value), span) => {
163                emit_event!(cb, Match::Float(value), span, pointer)
164            }
165            (Token::Integer(value), span) => {
166                emit_event!(cb, Match::Integer(value), span, pointer)
167            }
168            (Token::Boolean(value), span) => {
169                emit_event!(cb, Match::Boolean(value), span, pointer)
170            }
171            (Token::Null, span) => {
172                emit_event!(cb, Match::Null, span, pointer)
173            }
174            (token, span) => {
175                sax_parser_error!(ParserErrorDetails::UnexpectedToken(token), span.start)
176            }
177        }
178    }
179
180    /// An object is just a list of comma separated KV pairs
181    fn parse_object<Callback>(
182        &self,
183        lexer: &mut Lexer,
184        pointer: &mut JsonPointer,
185        cb: &mut Callback,
186    ) -> ParserResult<()>
187    where
188        Callback: FnMut(&Event) -> ParserResult<()>,
189    {
190        loop {
191            match lexer.consume()? {
192                (Token::Str(str), span) => {
193                    pointer.push_name(str.replace("\"", ""));
194                    emit_event!(cb, Match::ObjectKey(Cow::Borrowed(&str)), span, pointer)?;
195                    let should_be_colon = lexer.consume()?;
196                    match should_be_colon {
197                        (Token::Colon, _) => {
198                            self.parse_value(lexer, pointer, cb)?;
199                            pointer.pop();
200                        }
201                        (_, _) => {
202                            return sax_parser_error!(
203                                ParserErrorDetails::PairExpected,
204                                should_be_colon.1.start
205                            )
206                        }
207                    }
208                }
209                (Token::Comma, _) => (),
210                (Token::EndObject, span) => {
211                    return emit_event!(cb, Match::EndObject, span, pointer);
212                }
213                (_token, span) => {
214                    return sax_parser_error!(ParserErrorDetails::InvalidArray, span.start)
215                }
216            }
217        }
218    }
219
220    /// An array is just a list of comma separated values
221    fn parse_array<Callback>(
222        &self,
223        lexer: &mut Lexer,
224        pointer: &mut JsonPointer,
225        cb: &mut Callback,
226    ) -> ParserResult<()>
227    where
228        Callback: FnMut(&Event) -> ParserResult<()>,
229    {
230        let mut index = 0;
231        loop {
232            pointer.push_index(index);
233            match lexer.consume()? {
234                (Token::StartArray, span) => {
235                    emit_event!(cb, Match::StartArray, span, pointer)?;
236                    self.parse_array(lexer, pointer, cb)?;
237                }
238                (Token::EndArray, span) => {
239                    pointer.pop();
240                    return emit_event!(cb, Match::EndArray, span, pointer);
241                }
242                (Token::StartObject, span) => {
243                    emit_event!(cb, Match::StartObject, span, pointer)?;
244                    self.parse_object(lexer, pointer, cb)?;
245                }
246                (Token::Str(str), span) => {
247                    emit_event!(cb, Match::String(Cow::Borrowed(&str)), span, pointer)?;
248                }
249                (Token::Float(value), span) => {
250                    emit_event!(cb, Match::Float(value), span, pointer)?;
251                }
252                (Token::Integer(value), span) => {
253                    emit_event!(cb, Match::Integer(value), span, pointer)?;
254                }
255                (Token::Boolean(value), span) => {
256                    emit_event!(cb, Match::Boolean(value), span, pointer)?;
257                }
258                (Token::Null, span) => emit_event!(cb, Match::Null, span, pointer)?,
259                (Token::Comma, _) => index += 1,
260                (_token, span) => {
261                    return sax_parser_error!(ParserErrorDetails::InvalidArray, span.start);
262                }
263            }
264            pointer.pop();
265        }
266    }
267}
268
269#[cfg(test)]
270mod tests {
271    use std::io::BufReader;
272    use std::path::PathBuf;
273    use std::time::Instant;
274    use std::{env, fs};
275
276    use bytesize::ByteSize;
277
278    use crate::parsers::sax::Parser;
279    use crate::relative_file;
280    use crate::results::ParserErrorDetails;
281
282    #[test]
283    fn should_puke_on_empty_input() {
284        let input = "";
285        let parser = Parser::default();
286        let parsed = parser.parse_str(input, &mut |_e| Ok(()));
287        assert!(parsed.is_err());
288        assert_eq!(
289            parsed.err().unwrap().details,
290            ParserErrorDetails::ZeroLengthInput
291        );
292    }
293
294    #[test]
295    fn should_parse_successfully() {
296        let mut counter = 0;
297        let path = relative_file!("fixtures/json/valid/events.json");
298        let parser = Parser::default();
299        let parsed = parser.parse_file(&path, &mut |_e| {
300            counter += 1;
301            Ok(())
302        });
303        println!("{} SAX events processed", counter);
304        assert!(parsed.is_ok());
305    }
306
307    #[test]
308    fn should_successfully_bail() {
309        let path = relative_file!("fixtures/json/invalid/invalid_1.json");
310        let parser = Parser::default();
311        let parsed = parser.parse_file(&path, &mut |_e| Ok(()));
312        println!("Parse result = {:?}", parsed);
313        assert!(parsed.is_err());
314        assert_eq!(
315            parsed.err().unwrap().details,
316            ParserErrorDetails::InvalidRootObject
317        );
318    }
319
320    #[test]
321    fn should_allow_for_parsing_of_a_buffer() {
322        let input = "{ \"test\" : 2123232323}".as_bytes();
323        let mut buffer = BufReader::new(input);
324        let parser = Parser::default();
325        let _parsed = parser.parse_buffer(&mut buffer, &mut |_e| Ok(()));
326    }
327
328    #[test]
329    fn should_parse_basic_test_files() {
330        for f in fs::read_dir("fixtures/json/valid").unwrap() {
331            let path = f.unwrap().path();
332            println!("Parsing {:?}", &path);
333            if path.is_file() {
334                let mut counter = 0;
335                let len = fs::metadata(&path).unwrap().len();
336                let start = Instant::now();
337                let path = relative_file!(path.to_str().unwrap());
338                let parser = Parser::default();
339                let parsed = parser.parse_file(&path, &mut |_e| {
340                    counter += 1;
341                    Ok(())
342                });
343                if parsed.is_err() {
344                    println!("Parse of {:?} failed!", &path);
345                    println!("Parse failed with errors: {:?}", &parsed)
346                }
347                assert!(parsed.is_ok());
348                println!(
349                    "Parsed {} in {:?} [{:?}], {} SAX events processed",
350                    ByteSize(len),
351                    start.elapsed(),
352                    path,
353                    counter
354                );
355            }
356        }
357    }
358}