turbo_json_checker/
lib.rs

//! `turbo-json-checker` is a library that provides JSON validation without
//! keeping the stream of bytes in memory: it streams the bytes and validates
//! them on the fly using a pushdown automaton.
4//!
5//! It returns the root type of the json [(Array, Object, String,
6//! ...)](crate::JsonType), followed by the index of its first and last non whitespace character (ex: `(Array, 1, 12)`).
7//!
8//! This library is a fork of [oxidized-json-checker](https://github.com/Kerollmops/oxidized-json-checker)
9//! which is itself an improvement of the [json.org](http://www.json.org/JSON_checker/) checker.
10//!
11//! # Example: validate some bytes
12//!
13//! This example shows how you can give the library a simple slice
14//! of bytes and validate that it is a valid JSON document.
15//!
16//! ```
17//! # fn fmain() -> Result<(), Box<dyn std::error::Error>> {
18//! // index:     0                                        41
19//! //            |                                        |
20//! //            v                                        v
21//! let text = r#"["I", "am", "a", "valid", "JSON", "array"]"#;
22//! let bytes = text.as_bytes();
23//!
24//! let (json_type, start, end) = turbo_json_checker::validate(bytes)?;
25//!
26//! assert_eq!(json_type, turbo_json_checker::JsonType::Array);
27//! assert_eq!(start, 0);
28//! assert_eq!(end, 41);
29//! # Ok(()) }
30//! # fmain().unwrap()
31//! ```
32//!
33//! # Example: validate a stream of bytes
34//!
//! This example shows that you can pass any type that implements `io::Read`
//! to `validate` (or wrap it in a `JsonChecker`) and check that it is valid JSON.
37//!
38//! ```
39//! # const json_bytes: &[u8] = b"null";
40//! # fn streaming_from_the_web() -> std::io::Result<&'static [u8]> {
41//! #     Ok(json_bytes)
42//! # }
43//! # fn fmain() -> Result<(), Box<dyn std::error::Error>> {
44//! let stream = streaming_from_the_web()?;
45//!
46//! turbo_json_checker::validate(stream)?;
47//! # Ok(()) }
48//! # fmain().unwrap()
49//! ```
50//!
51//! # Example: complex compositions
52//!
//! This example shows how you can use the `JsonChecker` type to check
//! a compressed stream of bytes.
55//!
56//! You can decompress the stream, check it using the `JsonChecker`, and compress it
57//! again to pipe it elsewhere. All of that without much memory impact.
58//!
59//! ```no_run
60//! # fn fmain() -> Result<(), Box<dyn std::error::Error>> {
61//! use std::io;
62//! use turbo_json_checker::JsonChecker;
63//!
64//! let stdin = io::stdin();
65//! let stdout = io::stdout();
66//!
67//! // Wrap the stdin reader in a Snappy reader
68//! // then wrap it in a JsonChecker reader.
69//! let rdr = snap::read::FrameDecoder::new(stdin.lock());
70//! let mut rdr = JsonChecker::new(rdr);
71//!
72//! // Wrap the stdout writer in a Snappy writer.
73//! let mut wtr = snap::write::FrameEncoder::new(stdout.lock());
74//!
//! // The copy function will return any io error raised by the reader;
//! // the JsonChecker returns an error when invalid JSON is encountered.
77//! io::copy(&mut rdr, &mut wtr)?;
78//!
79//! // We must check that the final bytes were valid.
80//! rdr.finish()?;
81//! # Ok(()) }
82//! # fmain().unwrap()
83//! ```
84//!
85
86use crate::internals::{Class, Mode, State};
87use crate::internals::{ASCII_CLASS, STATE_TRANSITION_TABLE};
88use std::{fmt, io};
89
90mod internals;
91#[cfg(test)]
92mod tests;
93
/// The error type returned by the `JsonChecker` type.
///
/// Each variant names the syntax problem detected while feeding bytes to the
/// checker, or while finishing it. The type is a plain field-less enum, so it
/// is `Copy`, comparable with `==` and usable as a hash-map/hash-set key.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum Error {
    /// The byte belongs to a character class that is never allowed in JSON.
    InvalidCharacter,
    /// A `}` was met in the "empty object" position while the checker
    /// was not expecting an object key.
    EmptyCurlyBraces,
    /// A `}` was met while the checker was not inside an object value.
    OrphanCurlyBrace,
    /// A `]` was met while the checker was not inside an array.
    OrphanSquareBrace,
    /// Opening a new nesting level would exceed the configured maximum depth.
    MaxDepthReached,
    /// A `"` was met in a context where no string may start or end.
    InvalidQuote,
    /// A `,` was met outside of an object or an array.
    InvalidComma,
    /// A `:` was met while the checker was not expecting the end of an object key.
    InvalidColon,
    /// The state transition table has no valid transition for the byte.
    InvalidState,
    /// The input ended before the JSON document was complete. This error is
    /// also recorded when the underlying reader fails.
    IncompleteElement,
}
108
109impl From<Error> for io::Error {
110    fn from(err: Error) -> io::Error {
111        io::Error::new(io::ErrorKind::Other, err)
112    }
113}
114
// `Error` only needs the default `std::error::Error` methods. Implementing the
// trait is what allows it to be used as the payload of an `io::Error`.
impl std::error::Error for Error {}
116
117impl fmt::Display for Error {
118    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
119        match self {
120            Error::InvalidCharacter => f.write_str("invalid character"),
121            Error::EmptyCurlyBraces => f.write_str("empty curly braces"),
122            Error::OrphanCurlyBrace => f.write_str("orphan curly brace"),
123            Error::OrphanSquareBrace => f.write_str("orphan square brace"),
124            Error::MaxDepthReached => f.write_str("max depth reached"),
125            Error::InvalidQuote => f.write_str("invalid quote"),
126            Error::InvalidComma => f.write_str("invalid comma"),
127            Error::InvalidColon => f.write_str("invalid colon"),
128            Error::InvalidState => f.write_str("invalid state"),
129            Error::IncompleteElement => f.write_str("incomplete element"),
130        }
131    }
132}
133
/// The kind of value found at the root of a JSON document.
///
/// The variants are ordered as declared, which is the order used by the
/// derived `PartialOrd`/`Ord` implementations.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum JsonType {
    /// The `null` literal.
    Null,
    /// A boolean literal.
    Bool,
    /// A number.
    Number,
    /// A string.
    String,
    /// An array.
    Array,
    /// An object.
    Object,
}
144
145/// A convenient method to check and consume JSON from a stream of bytes.
146///
147/// # Example
148///
149/// ```
150/// # fn fmain() -> Result<(), Box<dyn std::error::Error>> {
151/// use turbo_json_checker::{validate, JsonType};
152/// let text = r#""I am a simple string!""#;
153/// let bytes = text.as_bytes();
154///
155/// let json_type = validate(bytes)?;
156/// assert_eq!(json_type, (JsonType::String, 0, text.len() - 1));
157/// # Ok(()) }
158/// # fmain().unwrap()
159/// ```
160pub fn validate<R: io::Read>(reader: R) -> io::Result<(JsonType, usize, usize)> {
161    let mut checker = JsonChecker::new(reader);
162    io::copy(&mut checker, &mut io::sink())?;
163    let outer_type = checker.finish()?;
164    Ok(outer_type)
165}
166
167/// A convenient method to check and consume JSON from an `str`.
168pub fn validate_str(string: &str) -> Result<(JsonType, usize, usize), Error> {
169    validate_bytes(string.as_bytes())
170}
171
172/// A convenient method to check and consume JSON from a bytes slice.
173pub fn validate_bytes(bytes: &[u8]) -> Result<(JsonType, usize, usize), Error> {
174    let mut checker = JsonChecker::new(());
175    checker.next_bytes(bytes)?;
176    checker.finish()
177}
178
/// The `JsonChecker` is an `io::Read` adapter, it can be used like a pipe:
/// it reads bytes from the wrapped reader, checks them and outputs the same bytes.
///
/// If an error is encountered, either a JSON syntax error or an `io::Error`,
/// it is returned by the `io::Read::read` method.
///
/// # Fusing
///
/// Once the checker has recorded an error it is *fused*: every later
/// call to `read` returns that error.
///
/// # Example: read from a slice
///
/// ```
/// # fn fmain() -> Result<(), Box<dyn std::error::Error>> {
/// use std::io;
/// use turbo_json_checker::JsonChecker;
///
/// let text = r#"{"I am": "an object"}"#;
/// let bytes = text.as_bytes();
///
/// let mut checker = JsonChecker::new(bytes);
/// io::copy(&mut checker, &mut io::sink())?;
/// checker.finish()?;
/// # Ok(()) }
/// # fmain().unwrap()
/// ```
pub struct JsonChecker<R> {
    /// Current state of the automaton, `State::Go` for a fresh checker.
    state: State,
    /// The error that fused the checker, if any.
    error: Option<Error>,
    /// Type of the root value, recorded by the first transition that reveals it.
    outer_type: Option<JsonType>,
    /// Limit checked every time a mode is pushed onto `stack`.
    max_depth: usize,
    /// Stack of modes, a fresh checker holds a single `Mode::Done` entry.
    stack: Vec<Mode>,
    /// Position counter for the fed bytes; it starts at 0 and is incremented
    /// before a byte is processed, so positions derived from it are one-based.
    idx: usize,
    /// Value of `idx` when the root type was recorded (one-based position).
    start: Option<usize>,
    /// One-based position of the last byte of the root value. It is only
    /// recorded when a byte following the root value is processed.
    end: Option<usize>,
    /// The wrapped reader, given back by `into_inner`.
    reader: R,
}
216
217impl<R> fmt::Debug for JsonChecker<R> {
218    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
219        f.debug_struct("JsonChecker").finish()
220    }
221}
222
223impl<R> JsonChecker<R> {
224    /// Construct a `JsonChecker. To continue the process, write to the `JsonChecker`
225    /// like a sink, and then call `JsonChecker::finish` to obtain the final result.
226    pub fn new(reader: R) -> JsonChecker<R> {
227        JsonChecker::with_max_depth(reader, usize::max_value())
228    }
229
230    /// Construct a `JsonChecker` and restrict the level of maximum nesting.
231    ///
232    /// For more information read the `JsonChecker::new` documentation.
233    pub fn with_max_depth(reader: R, max_depth: usize) -> JsonChecker<R> {
234        JsonChecker {
235            state: State::Go,
236            error: None,
237            outer_type: None,
238            max_depth,
239            stack: vec![Mode::Done],
240            idx: 0,
241            start: None,
242            end: None,
243            reader,
244        }
245    }
246
247    #[inline]
248    #[cfg(feature = "nightly")]
249    fn next_bytes(&mut self, bytes: &[u8]) -> Result<(), Error> {
250        use packed_simd::u8x8;
251
252        // TODO use chunks_exact instead?
253        // By using u8x8 instead of u8x16 we lost 2s on 16s but
254        // we are less prone to find state change requirements.
255        for chunk in bytes.chunks(u8x8::lanes()) {
256            if chunk.len() == u8x8::lanes() && self.state == State::St {
257                // Load the bytes into a SIMD type
258                let bytes = u8x8::from_slice_unaligned(chunk);
259
260                // According to the state STATE_TRANSITION_TABLE we are in the `St` state
261                // and *none of those bytes* are in the `CWhite`, `CQuote` or `CBacks` ascci class
262                // we can avoid processing them at all because they will not change the current state.
263
264                let cquotes = u8x8::splat(b'"');
265                let cbacks = u8x8::splat(b'\\');
266
267                let cwhites1 = u8x8::splat(b'\t');
268                let cwhites2 = u8x8::splat(b'\n');
269                let cwhites3 = u8x8::splat(b'\r');
270
271                // We first compare with quotes because this is the most
272                // common character we can encounter in valid JSON strings
273                // and this way we are able to skip other comparisons faster
274                if bytes.eq(cquotes).any()
275                    || bytes.eq(cbacks).any()
276                    || bytes.eq(cwhites1).any()
277                    || bytes.eq(cwhites2).any()
278                    || bytes.eq(cwhites3).any()
279                {
280                    chunk.iter().try_for_each(|b| self.next_byte(*b))?;
281                }
282
283                // Now that we checked that these bytes will not change
284                // the state we can continue to the next chunk and ignore them
285            } else {
286                chunk.iter().try_for_each(|b| self.next_byte(*b))?;
287            }
288        }
289
290        Ok(())
291    }
292
293    #[inline]
294    #[cfg(not(feature = "nightly"))]
295    fn next_bytes(&mut self, bytes: &[u8]) -> Result<(), Error> {
296        bytes.iter().try_for_each(|b| {
297            self.idx += 1;
298            self.next_byte(*b)
299        })
300    }
301
302    #[inline]
303    fn next_byte(&mut self, next_byte: u8) -> Result<(), Error> {
304        if let Some(error) = self.error {
305            return Err(error);
306        }
307
308        // We can potentially use try_blocks in the future.
309        fn internal_next_byte<R>(jc: &mut JsonChecker<R>, next_byte: u8) -> Result<(), Error> {
310            // Determine the character's class.
311            let next_class = if next_byte >= 128 {
312                Class::CEtc
313            } else {
314                ASCII_CLASS[next_byte as usize]
315            };
316
317            if next_class == Class::Invalid {
318                return Err(Error::InvalidCharacter);
319            }
320
321            // Get the next state from the state transition table and
322            // perform one of the actions.
323            let next_state = STATE_TRANSITION_TABLE[jc.state as usize][next_class as usize];
324
325            // Save the type we met if not already saved.
326            if jc.outer_type.is_none() {
327                match next_state {
328                    State::N1 => {
329                        jc.outer_type = Some(JsonType::Null);
330                        jc.start = Some(jc.idx);
331                    }
332                    State::T1 | State::F1 => {
333                        jc.outer_type = Some(JsonType::Bool);
334                        jc.start = Some(jc.idx);
335                    }
336                    State::In => {
337                        jc.outer_type = Some(JsonType::Number);
338                        jc.start = Some(jc.idx);
339                    }
340                    State::Wq => {
341                        jc.outer_type = Some(JsonType::String);
342                        jc.start = Some(jc.idx);
343                    }
344                    State::Wos => {
345                        jc.outer_type = Some(JsonType::Array);
346                        jc.start = Some(jc.idx);
347                    }
348                    State::Woc => {
349                        jc.outer_type = Some(JsonType::Object);
350                        jc.start = Some(jc.idx);
351                    }
352                    _ => (),
353                }
354            }
355
356            match next_state {
357                State::Wec => {
358                    // Empty }
359                    if !jc.pop(Mode::Key) {
360                        return Err(Error::EmptyCurlyBraces);
361                    }
362                    jc.state = State::Ok;
363                }
364                State::Wcu => {
365                    // }
366                    if !jc.pop(Mode::Object) {
367                        return Err(Error::OrphanCurlyBrace);
368                    }
369                    jc.state = State::Ok;
370                }
371                State::Ws => {
372                    // ]
373                    if !jc.pop(Mode::Array) {
374                        return Err(Error::OrphanSquareBrace);
375                    }
376                    jc.state = State::Ok;
377                }
378                State::Woc => {
379                    // {
380                    if !jc.push(Mode::Key) {
381                        return Err(Error::MaxDepthReached);
382                    }
383                    jc.state = State::Ob;
384                }
385                State::Wos => {
386                    // [
387                    if !jc.push(Mode::Array) {
388                        return Err(Error::MaxDepthReached);
389                    }
390                    jc.state = State::Ar;
391                }
392                State::Wq => {
393                    // "
394                    match jc.stack.last() {
395                        Some(Mode::Done) => {
396                            if !jc.push(Mode::String) {
397                                return Err(Error::MaxDepthReached);
398                            }
399                            jc.state = State::St;
400                        }
401                        Some(Mode::String) => {
402                            jc.pop(Mode::String);
403                            jc.state = State::Ok;
404                        }
405                        Some(Mode::Key) => jc.state = State::Co,
406                        Some(Mode::Array) | Some(Mode::Object) => jc.state = State::Ok,
407                        _ => return Err(Error::InvalidQuote),
408                    }
409                }
410                State::Wcm => {
411                    // ,
412                    match jc.stack.last() {
413                        Some(Mode::Object) => {
414                            // A comma causes a flip from object mode to key mode.
415                            if !jc.pop(Mode::Object) || !jc.push(Mode::Key) {
416                                return Err(Error::InvalidComma);
417                            }
418                            jc.state = State::Ke;
419                        }
420                        Some(Mode::Array) => jc.state = State::Va,
421                        _ => return Err(Error::InvalidComma),
422                    }
423                }
424                State::Wcl => {
425                    // :
426                    // A colon causes a flip from key mode to object mode.
427                    if !jc.pop(Mode::Key) || !jc.push(Mode::Object) {
428                        return Err(Error::InvalidColon);
429                    }
430                    jc.state = State::Va;
431                }
432                State::Invalid => return Err(Error::InvalidState),
433
434                // Or change the state.
435                state => {
436                    jc.state = state;
437                    if jc.stack.len() == 1 && jc.end.is_none() && state == State::Ok {
438                        jc.end = Some(jc.idx - 1); // If in state `OK` last state has already been poped. We must go back one char to be on the last character of the previous State.
439                    }
440                }
441            }
442
443            Ok(())
444        }
445
446        // By catching returned errors when this `JsonChecker` is used we *fuse*
447        // the checker and ensure the user don't use a checker in an invalid state.
448        if let Err(error) = internal_next_byte(self, next_byte) {
449            self.error = Some(error);
450            return Err(error);
451        }
452
453        Ok(())
454    }
455
456    /// The `JsonChecker::finish` method must be called after all of the characters
457    /// have been processed.
458    ///
459    /// This function consumes the `JsonChecker` and returns `Ok(JsonType)` if the
460    /// JSON text was accepted and the JSON type guessed.
461    pub fn finish(self) -> Result<(JsonType, usize, usize), Error> {
462        self.into_inner().map(|(_, t, start, end)| (t, start, end))
463    }
464
465    /// The `JsonChecker::into_inner` does the same as the `JsonChecker::finish`
466    /// method but returns the internal reader along with the JSON type guessed.
467    pub fn into_inner(mut self) -> Result<(R, JsonType, usize, usize), Error> {
468        let is_state_valid = match self.state {
469            State::Ok | State::In | State::Fr | State::Fs | State::E3 => true,
470            _ => false,
471        };
472
473        if is_state_valid && self.pop(Mode::Done) {
474            let outer_type = self
475                .outer_type
476                .expect("BUG: the outer type must have been guessed");
477            return Ok((
478                self.reader,
479                outer_type,
480                self.start.unwrap() - 1,
481                self.end.unwrap_or(self.idx) - 1,
482            ));
483        }
484
485        // We do not need to catch this error to *fuse* the checker because this method
486        // consumes the checker, it cannot be reused after an error has been thrown.
487        Err(Error::IncompleteElement)
488    }
489
490    /// Push a mode onto the stack. Returns false if max depth is reached.
491    fn push(&mut self, mode: Mode) -> bool {
492        if self.stack.len() + 1 >= self.max_depth {
493            return false;
494        }
495        self.stack.push(mode);
496        return true;
497    }
498
499    /// Pop the stack, assuring that the current mode matches the expectation.
500    /// Return false if the stack is empty or if the modes mismatch.
501    fn pop(&mut self, mode: Mode) -> bool {
502        self.stack.pop() == Some(mode)
503    }
504}
505
506impl<R: io::Read> io::Read for JsonChecker<R> {
507    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
508        // If an error have already been encountered we return it,
509        // this *fuses* the JsonChecker.
510        if let Some(error) = self.error {
511            return Err(error.into());
512        }
513
514        let len = match self.reader.read(buf) {
515            Err(error) => {
516                // We do not store the io::Error in the JsonChecker Error
517                // type instead we use the IncompleteElement error.
518                self.error = Some(Error::IncompleteElement);
519                return Err(error);
520            }
521            Ok(len) => len,
522        };
523
524        self.next_bytes(&buf[..len])?;
525
526        Ok(len)
527    }
528}