turbo_json_checker/lib.rs
//! `turbo-json-checker` is a library that validates JSON without keeping the
//! stream of bytes in memory: it streams the bytes and validates them on the
//! fly using a pushdown automaton.
4//!
5//! It returns the root type of the json [(Array, Object, String,
6//! ...)](crate::JsonType), followed by the index of its first and last non whitespace character (ex: `(Array, 1, 12)`).
7//!
8//! This library is a fork of [oxidized-json-checker](https://github.com/Kerollmops/oxidized-json-checker)
9//! which is itself an improvement of the [json.org](http://www.json.org/JSON_checker/) checker.
10//!
11//! # Example: validate some bytes
12//!
13//! This example shows how you can give the library a simple slice
14//! of bytes and validate that it is a valid JSON document.
15//!
16//! ```
17//! # fn fmain() -> Result<(), Box<dyn std::error::Error>> {
18//! // index: 0 41
19//! // | |
20//! // v v
21//! let text = r#"["I", "am", "a", "valid", "JSON", "array"]"#;
22//! let bytes = text.as_bytes();
23//!
24//! let (json_type, start, end) = turbo_json_checker::validate(bytes)?;
25//!
26//! assert_eq!(json_type, turbo_json_checker::JsonType::Array);
27//! assert_eq!(start, 0);
28//! assert_eq!(end, 41);
29//! # Ok(()) }
30//! # fmain().unwrap()
31//! ```
32//!
33//! # Example: validate a stream of bytes
34//!
//! This example shows that you can pass any type that implements `io::Read`
//! to the `JsonChecker` and validate that it yields valid JSON.
37//!
38//! ```
39//! # const json_bytes: &[u8] = b"null";
40//! # fn streaming_from_the_web() -> std::io::Result<&'static [u8]> {
41//! # Ok(json_bytes)
42//! # }
43//! # fn fmain() -> Result<(), Box<dyn std::error::Error>> {
44//! let stream = streaming_from_the_web()?;
45//!
46//! turbo_json_checker::validate(stream)?;
47//! # Ok(()) }
48//! # fmain().unwrap()
49//! ```
50//!
51//! # Example: complex compositions
52//!
//! This example shows how you can use the `JsonChecker` type to check
//! a compressed stream of bytes.
55//!
56//! You can decompress the stream, check it using the `JsonChecker`, and compress it
57//! again to pipe it elsewhere. All of that without much memory impact.
58//!
59//! ```no_run
60//! # fn fmain() -> Result<(), Box<dyn std::error::Error>> {
61//! use std::io;
62//! use turbo_json_checker::JsonChecker;
63//!
64//! let stdin = io::stdin();
65//! let stdout = io::stdout();
66//!
67//! // Wrap the stdin reader in a Snappy reader
68//! // then wrap it in a JsonChecker reader.
69//! let rdr = snap::read::FrameDecoder::new(stdin.lock());
70//! let mut rdr = JsonChecker::new(rdr);
71//!
72//! // Wrap the stdout writer in a Snappy writer.
73//! let mut wtr = snap::write::FrameEncoder::new(stdout.lock());
74//!
//! // The copy function returns any io error raised by the reader;
//! // the JsonChecker raises an error when invalid JSON is encountered.
77//! io::copy(&mut rdr, &mut wtr)?;
78//!
79//! // We must check that the final bytes were valid.
80//! rdr.finish()?;
81//! # Ok(()) }
82//! # fmain().unwrap()
83//! ```
84//!
85
86use crate::internals::{Class, Mode, State};
87use crate::internals::{ASCII_CLASS, STATE_TRANSITION_TABLE};
88use std::{fmt, io};
89
90mod internals;
91#[cfg(test)]
92mod tests;
93
/// The error type returned by the `JsonChecker` type.
///
/// Every variant is a payload-free tag, so the type is cheap to copy and can
/// be compared, hashed and used as a map key.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum Error {
    /// A byte was read whose character class is marked as invalid.
    InvalidCharacter,
    /// The `}` of an empty object was read while the checker was not
    /// expecting an object key.
    EmptyCurlyBraces,
    /// A `}` was read while the checker was not inside an object.
    OrphanCurlyBrace,
    /// A `]` was read while the checker was not inside an array.
    OrphanSquareBrace,
    /// Opening one more nesting level would exceed the configured maximum depth.
    MaxDepthReached,
    /// A `"` was read in a context where no string can start or end.
    InvalidQuote,
    /// A `,` was read outside of an array or of an object value position.
    InvalidComma,
    /// A `:` was read while the checker was not right after an object key.
    InvalidColon,
    /// The state machine reached an invalid state, for instance because no
    /// transition exists for the byte that was read.
    InvalidState,
    /// The input stopped before the JSON value was complete, or the
    /// underlying reader failed while the value was being read.
    IncompleteElement,
}
108
109impl From<Error> for io::Error {
110 fn from(err: Error) -> io::Error {
111 io::Error::new(io::ErrorKind::Other, err)
112 }
113}
114
// Marker implementation: the derived `Debug` and the hand-written `Display`
// are all the trait requires, and its default methods are left untouched.
impl std::error::Error for Error {}
116
117impl fmt::Display for Error {
118 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
119 match self {
120 Error::InvalidCharacter => f.write_str("invalid character"),
121 Error::EmptyCurlyBraces => f.write_str("empty curly braces"),
122 Error::OrphanCurlyBrace => f.write_str("orphan curly brace"),
123 Error::OrphanSquareBrace => f.write_str("orphan square brace"),
124 Error::MaxDepthReached => f.write_str("max depth reached"),
125 Error::InvalidQuote => f.write_str("invalid quote"),
126 Error::InvalidComma => f.write_str("invalid comma"),
127 Error::InvalidColon => f.write_str("invalid colon"),
128 Error::InvalidState => f.write_str("invalid state"),
129 Error::IncompleteElement => f.write_str("incomplete element"),
130 }
131 }
132}
133
/// The kind of value found at the root of a JSON document.
///
/// The variants are ordered as declared (`Null` is the smallest, `Object`
/// the greatest); the derived `PartialOrd`/`Ord` rely on that order.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum JsonType {
    /// The `null` literal.
    Null,
    /// The `true` or `false` literal.
    Bool,
    /// A number.
    Number,
    /// A string.
    String,
    /// An array.
    Array,
    /// An object.
    Object,
}
144
145/// A convenient method to check and consume JSON from a stream of bytes.
146///
147/// # Example
148///
149/// ```
150/// # fn fmain() -> Result<(), Box<dyn std::error::Error>> {
151/// use turbo_json_checker::{validate, JsonType};
152/// let text = r#""I am a simple string!""#;
153/// let bytes = text.as_bytes();
154///
155/// let json_type = validate(bytes)?;
156/// assert_eq!(json_type, (JsonType::String, 0, text.len() - 1));
157/// # Ok(()) }
158/// # fmain().unwrap()
159/// ```
160pub fn validate<R: io::Read>(reader: R) -> io::Result<(JsonType, usize, usize)> {
161 let mut checker = JsonChecker::new(reader);
162 io::copy(&mut checker, &mut io::sink())?;
163 let outer_type = checker.finish()?;
164 Ok(outer_type)
165}
166
167/// A convenient method to check and consume JSON from an `str`.
168pub fn validate_str(string: &str) -> Result<(JsonType, usize, usize), Error> {
169 validate_bytes(string.as_bytes())
170}
171
172/// A convenient method to check and consume JSON from a bytes slice.
173pub fn validate_bytes(bytes: &[u8]) -> Result<(JsonType, usize, usize), Error> {
174 let mut checker = JsonChecker::new(());
175 checker.next_bytes(bytes)?;
176 checker.finish()
177}
178
/// The `JsonChecker` is an `io::Read` adapter: it can be used like a pipe
/// that reads bytes from the wrapped reader, checks them, and hands the very
/// same bytes to its caller.
///
/// If an error is encountered, be it a JSON syntax error or an `io::Error`,
/// it is returned by the `io::Read::read` method.
///
/// # Errors
///
/// An error encountered while reading bytes will invalidate the checker.
///
/// # Example: read from a slice
///
/// ```
/// # fn fmain() -> Result<(), Box<dyn std::error::Error>> {
/// use std::io;
/// use turbo_json_checker::JsonChecker;
///
/// let text = r#"{"I am": "an object"}"#;
/// let bytes = text.as_bytes();
///
/// let mut checker = JsonChecker::new(bytes);
/// io::copy(&mut checker, &mut io::sink())?;
/// checker.finish()?;
/// # Ok(()) }
/// # fmain().unwrap()
/// ```
pub struct JsonChecker<R> {
    /// Current state of the automaton; a fresh checker starts in `State::Go`.
    state: State,
    /// First error met, if any. Once set it is returned again by every
    /// later attempt to feed or read bytes.
    error: Option<Error>,
    /// Type of the root value, recorded by the first transition that reveals it.
    outer_type: Option<JsonType>,
    /// Limit on the size of `stack`: a push is refused when
    /// `stack.len() + 1 >= max_depth`.
    max_depth: usize,
    /// Stack of nesting modes; it starts with a single `Mode::Done` entry.
    stack: Vec<Mode>,
    /// Number of bytes fed so far. The non-SIMD `next_bytes` bumps it before
    /// handing each byte over, so it is the 1-based position of that byte.
    idx: usize,
    /// Value of `idx` when `outer_type` was recorded.
    start: Option<usize>,
    /// Value of `idx - 1` captured the first time a plain state change lands
    /// in `State::Ok` while `stack` holds a single entry; `None` until then.
    end: Option<usize>,
    /// The wrapped reader.
    reader: R,
}
216
217impl<R> fmt::Debug for JsonChecker<R> {
218 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
219 f.debug_struct("JsonChecker").finish()
220 }
221}
222
223impl<R> JsonChecker<R> {
224 /// Construct a `JsonChecker. To continue the process, write to the `JsonChecker`
225 /// like a sink, and then call `JsonChecker::finish` to obtain the final result.
226 pub fn new(reader: R) -> JsonChecker<R> {
227 JsonChecker::with_max_depth(reader, usize::max_value())
228 }
229
230 /// Construct a `JsonChecker` and restrict the level of maximum nesting.
231 ///
232 /// For more information read the `JsonChecker::new` documentation.
233 pub fn with_max_depth(reader: R, max_depth: usize) -> JsonChecker<R> {
234 JsonChecker {
235 state: State::Go,
236 error: None,
237 outer_type: None,
238 max_depth,
239 stack: vec![Mode::Done],
240 idx: 0,
241 start: None,
242 end: None,
243 reader,
244 }
245 }
246
247 #[inline]
248 #[cfg(feature = "nightly")]
249 fn next_bytes(&mut self, bytes: &[u8]) -> Result<(), Error> {
250 use packed_simd::u8x8;
251
252 // TODO use chunks_exact instead?
253 // By using u8x8 instead of u8x16 we lost 2s on 16s but
254 // we are less prone to find state change requirements.
255 for chunk in bytes.chunks(u8x8::lanes()) {
256 if chunk.len() == u8x8::lanes() && self.state == State::St {
257 // Load the bytes into a SIMD type
258 let bytes = u8x8::from_slice_unaligned(chunk);
259
260 // According to the state STATE_TRANSITION_TABLE we are in the `St` state
261 // and *none of those bytes* are in the `CWhite`, `CQuote` or `CBacks` ascci class
262 // we can avoid processing them at all because they will not change the current state.
263
264 let cquotes = u8x8::splat(b'"');
265 let cbacks = u8x8::splat(b'\\');
266
267 let cwhites1 = u8x8::splat(b'\t');
268 let cwhites2 = u8x8::splat(b'\n');
269 let cwhites3 = u8x8::splat(b'\r');
270
271 // We first compare with quotes because this is the most
272 // common character we can encounter in valid JSON strings
273 // and this way we are able to skip other comparisons faster
274 if bytes.eq(cquotes).any()
275 || bytes.eq(cbacks).any()
276 || bytes.eq(cwhites1).any()
277 || bytes.eq(cwhites2).any()
278 || bytes.eq(cwhites3).any()
279 {
280 chunk.iter().try_for_each(|b| self.next_byte(*b))?;
281 }
282
283 // Now that we checked that these bytes will not change
284 // the state we can continue to the next chunk and ignore them
285 } else {
286 chunk.iter().try_for_each(|b| self.next_byte(*b))?;
287 }
288 }
289
290 Ok(())
291 }
292
293 #[inline]
294 #[cfg(not(feature = "nightly"))]
295 fn next_bytes(&mut self, bytes: &[u8]) -> Result<(), Error> {
296 bytes.iter().try_for_each(|b| {
297 self.idx += 1;
298 self.next_byte(*b)
299 })
300 }
301
302 #[inline]
303 fn next_byte(&mut self, next_byte: u8) -> Result<(), Error> {
304 if let Some(error) = self.error {
305 return Err(error);
306 }
307
308 // We can potentially use try_blocks in the future.
309 fn internal_next_byte<R>(jc: &mut JsonChecker<R>, next_byte: u8) -> Result<(), Error> {
310 // Determine the character's class.
311 let next_class = if next_byte >= 128 {
312 Class::CEtc
313 } else {
314 ASCII_CLASS[next_byte as usize]
315 };
316
317 if next_class == Class::Invalid {
318 return Err(Error::InvalidCharacter);
319 }
320
321 // Get the next state from the state transition table and
322 // perform one of the actions.
323 let next_state = STATE_TRANSITION_TABLE[jc.state as usize][next_class as usize];
324
325 // Save the type we met if not already saved.
326 if jc.outer_type.is_none() {
327 match next_state {
328 State::N1 => {
329 jc.outer_type = Some(JsonType::Null);
330 jc.start = Some(jc.idx);
331 }
332 State::T1 | State::F1 => {
333 jc.outer_type = Some(JsonType::Bool);
334 jc.start = Some(jc.idx);
335 }
336 State::In => {
337 jc.outer_type = Some(JsonType::Number);
338 jc.start = Some(jc.idx);
339 }
340 State::Wq => {
341 jc.outer_type = Some(JsonType::String);
342 jc.start = Some(jc.idx);
343 }
344 State::Wos => {
345 jc.outer_type = Some(JsonType::Array);
346 jc.start = Some(jc.idx);
347 }
348 State::Woc => {
349 jc.outer_type = Some(JsonType::Object);
350 jc.start = Some(jc.idx);
351 }
352 _ => (),
353 }
354 }
355
356 match next_state {
357 State::Wec => {
358 // Empty }
359 if !jc.pop(Mode::Key) {
360 return Err(Error::EmptyCurlyBraces);
361 }
362 jc.state = State::Ok;
363 }
364 State::Wcu => {
365 // }
366 if !jc.pop(Mode::Object) {
367 return Err(Error::OrphanCurlyBrace);
368 }
369 jc.state = State::Ok;
370 }
371 State::Ws => {
372 // ]
373 if !jc.pop(Mode::Array) {
374 return Err(Error::OrphanSquareBrace);
375 }
376 jc.state = State::Ok;
377 }
378 State::Woc => {
379 // {
380 if !jc.push(Mode::Key) {
381 return Err(Error::MaxDepthReached);
382 }
383 jc.state = State::Ob;
384 }
385 State::Wos => {
386 // [
387 if !jc.push(Mode::Array) {
388 return Err(Error::MaxDepthReached);
389 }
390 jc.state = State::Ar;
391 }
392 State::Wq => {
393 // "
394 match jc.stack.last() {
395 Some(Mode::Done) => {
396 if !jc.push(Mode::String) {
397 return Err(Error::MaxDepthReached);
398 }
399 jc.state = State::St;
400 }
401 Some(Mode::String) => {
402 jc.pop(Mode::String);
403 jc.state = State::Ok;
404 }
405 Some(Mode::Key) => jc.state = State::Co,
406 Some(Mode::Array) | Some(Mode::Object) => jc.state = State::Ok,
407 _ => return Err(Error::InvalidQuote),
408 }
409 }
410 State::Wcm => {
411 // ,
412 match jc.stack.last() {
413 Some(Mode::Object) => {
414 // A comma causes a flip from object mode to key mode.
415 if !jc.pop(Mode::Object) || !jc.push(Mode::Key) {
416 return Err(Error::InvalidComma);
417 }
418 jc.state = State::Ke;
419 }
420 Some(Mode::Array) => jc.state = State::Va,
421 _ => return Err(Error::InvalidComma),
422 }
423 }
424 State::Wcl => {
425 // :
426 // A colon causes a flip from key mode to object mode.
427 if !jc.pop(Mode::Key) || !jc.push(Mode::Object) {
428 return Err(Error::InvalidColon);
429 }
430 jc.state = State::Va;
431 }
432 State::Invalid => return Err(Error::InvalidState),
433
434 // Or change the state.
435 state => {
436 jc.state = state;
437 if jc.stack.len() == 1 && jc.end.is_none() && state == State::Ok {
438 jc.end = Some(jc.idx - 1); // If in state `OK` last state has already been poped. We must go back one char to be on the last character of the previous State.
439 }
440 }
441 }
442
443 Ok(())
444 }
445
446 // By catching returned errors when this `JsonChecker` is used we *fuse*
447 // the checker and ensure the user don't use a checker in an invalid state.
448 if let Err(error) = internal_next_byte(self, next_byte) {
449 self.error = Some(error);
450 return Err(error);
451 }
452
453 Ok(())
454 }
455
456 /// The `JsonChecker::finish` method must be called after all of the characters
457 /// have been processed.
458 ///
459 /// This function consumes the `JsonChecker` and returns `Ok(JsonType)` if the
460 /// JSON text was accepted and the JSON type guessed.
461 pub fn finish(self) -> Result<(JsonType, usize, usize), Error> {
462 self.into_inner().map(|(_, t, start, end)| (t, start, end))
463 }
464
465 /// The `JsonChecker::into_inner` does the same as the `JsonChecker::finish`
466 /// method but returns the internal reader along with the JSON type guessed.
467 pub fn into_inner(mut self) -> Result<(R, JsonType, usize, usize), Error> {
468 let is_state_valid = match self.state {
469 State::Ok | State::In | State::Fr | State::Fs | State::E3 => true,
470 _ => false,
471 };
472
473 if is_state_valid && self.pop(Mode::Done) {
474 let outer_type = self
475 .outer_type
476 .expect("BUG: the outer type must have been guessed");
477 return Ok((
478 self.reader,
479 outer_type,
480 self.start.unwrap() - 1,
481 self.end.unwrap_or(self.idx) - 1,
482 ));
483 }
484
485 // We do not need to catch this error to *fuse* the checker because this method
486 // consumes the checker, it cannot be reused after an error has been thrown.
487 Err(Error::IncompleteElement)
488 }
489
490 /// Push a mode onto the stack. Returns false if max depth is reached.
491 fn push(&mut self, mode: Mode) -> bool {
492 if self.stack.len() + 1 >= self.max_depth {
493 return false;
494 }
495 self.stack.push(mode);
496 return true;
497 }
498
499 /// Pop the stack, assuring that the current mode matches the expectation.
500 /// Return false if the stack is empty or if the modes mismatch.
501 fn pop(&mut self, mode: Mode) -> bool {
502 self.stack.pop() == Some(mode)
503 }
504}
505
506impl<R: io::Read> io::Read for JsonChecker<R> {
507 fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
508 // If an error have already been encountered we return it,
509 // this *fuses* the JsonChecker.
510 if let Some(error) = self.error {
511 return Err(error.into());
512 }
513
514 let len = match self.reader.read(buf) {
515 Err(error) => {
516 // We do not store the io::Error in the JsonChecker Error
517 // type instead we use the IncompleteElement error.
518 self.error = Some(Error::IncompleteElement);
519 return Err(error);
520 }
521 Ok(len) => len,
522 };
523
524 self.next_bytes(&buf[..len])?;
525
526 Ok(len)
527 }
528}