// eventson 0.1.0
//
// An event-based JSON parser with competitive performance
// Documentation
use eventson::error::{self, Error, InvalidInput};
use eventson::{Event, Parser, lex};
use serde_json::{Map, Value};
use std::io::{Cursor, Read};

// Relative directories holding the `.json` fixture files that the
// `parity_test_file!` tests (and the exhaustive buffer-size test) read.
const JSON_CHECKER_DIR: &str = "data/jsonchecker";
const ROUNDTRIP_DIR: &str = "data/roundtrip";
const OTHER_DIR: &str = "data/other";

// File-based parity tests. Each invocation expands to a `#[test]` function
// named after its second argument that reads `<dir>/<name>.json` and hands
// the contents to `run_parity_test`.

// Fixtures from the JSON checker directory.
parity_test_file!(JSON_CHECKER_DIR, pass01);
parity_test_file!(JSON_CHECKER_DIR, pass02);
parity_test_file!(JSON_CHECKER_DIR, pass03);

// Fixtures from the "other" directory.
parity_test_file!(OTHER_DIR, canada);
parity_test_file!(OTHER_DIR, twitter);
parity_test_file!(OTHER_DIR, citm_catalog);
parity_test_file!(OTHER_DIR, bitcoin);
parity_test_file!(OTHER_DIR, bitcoin2);
parity_test_file!(OTHER_DIR, foodfacts);
parity_test_file!(OTHER_DIR, pokedex);
parity_test_file!(OTHER_DIR, mtg);
parity_test_file!(OTHER_DIR, german_events);
parity_test_file!(OTHER_DIR, catalan_events);

// Fixtures from the roundtrip directory (26 and 27 are listed out of
// numeric order; each invocation is independent, so the order has no effect).
parity_test_file!(ROUNDTRIP_DIR, roundtrip01);
parity_test_file!(ROUNDTRIP_DIR, roundtrip02);
parity_test_file!(ROUNDTRIP_DIR, roundtrip03);
parity_test_file!(ROUNDTRIP_DIR, roundtrip04);
parity_test_file!(ROUNDTRIP_DIR, roundtrip05);
parity_test_file!(ROUNDTRIP_DIR, roundtrip06);
parity_test_file!(ROUNDTRIP_DIR, roundtrip07);
parity_test_file!(ROUNDTRIP_DIR, roundtrip08);
parity_test_file!(ROUNDTRIP_DIR, roundtrip09);
parity_test_file!(ROUNDTRIP_DIR, roundtrip10);
parity_test_file!(ROUNDTRIP_DIR, roundtrip11);
parity_test_file!(ROUNDTRIP_DIR, roundtrip12);
parity_test_file!(ROUNDTRIP_DIR, roundtrip13);
parity_test_file!(ROUNDTRIP_DIR, roundtrip14);
parity_test_file!(ROUNDTRIP_DIR, roundtrip15);
parity_test_file!(ROUNDTRIP_DIR, roundtrip16);
parity_test_file!(ROUNDTRIP_DIR, roundtrip17);
parity_test_file!(ROUNDTRIP_DIR, roundtrip18);
parity_test_file!(ROUNDTRIP_DIR, roundtrip19);
parity_test_file!(ROUNDTRIP_DIR, roundtrip20);
parity_test_file!(ROUNDTRIP_DIR, roundtrip21);
parity_test_file!(ROUNDTRIP_DIR, roundtrip22);
parity_test_file!(ROUNDTRIP_DIR, roundtrip23);
parity_test_file!(ROUNDTRIP_DIR, roundtrip24);
parity_test_file!(ROUNDTRIP_DIR, roundtrip25);
parity_test_file!(ROUNDTRIP_DIR, roundtrip27);
parity_test_file!(ROUNDTRIP_DIR, roundtrip26);

// Inline-literal parity tests. Each invocation expands to a `#[test]`
// function named after its first argument that passes the literal to
// `run_parity_test`.
parity_test_lit!(big_int, r#"{ "foo": 505871615125491712} "#);
parity_test_lit!(simple_object, r#"{ "xq": 1, "yq": 2 }"#);
parity_test_lit!(empty_object, "{}");
parity_test_lit!(empty_list, "[]");

// Parity test over one object that mixes strings, integers, floats,
// booleans and null with an array containing a further nested object and
// array.
parity_test_lit!(
    complex_object,
    r#"
      {
        "foo": "bar",
        "foo2": 123,
        "foo3": 1.234,
        "foo4": true,
        "foo5": false,
        "foo6": null,
        "foo7": [
          "itsalit",
          1,
          111.111,
          true,
          false,
          {
            "baz": "bing",
            "arrr": [
              {
                "diabolical": "yes"
              }
            ]
          }
        ],
        ":)": "val"
      }
    "#
);

// Parity test over triply nested arrays of negative and positive integers
// inside nested objects.
// NOTE(review): presumably a cut-down version of the `canada` fixture's
// shape, going by the name - confirm.
parity_test_lit!(
    simplified_canada,
    r#"
      {
        "geometry": {
          "coordinates": [
            [
              [
                -10,
                -20
              ]
            ],
            [
              [
                -50,
                  43
              ]
            ]
          ]
        }
      }
    "#
);

/// Runs the parity check on `german_events.json` once per buffer size,
/// counting down from 4096 until the parser reports that the buffer has
/// become too small for the data.
///
/// Ignored by default because it parses the whole file once per size.
#[test]
#[ignore = "This will take a while..."]
fn exhaustive_buf_size_parity_test() {
    let input = std::fs::read_to_string(format!("{}/german_events.json", OTHER_DIR)).unwrap();
    // The reference value does not depend on the buffer size, so parse it
    // once instead of on every iteration.
    let expected: Value = serde_json::from_str(&input).unwrap();

    // 4096 is more than double the size of the smallest token in this dataset
    // NOTE(review): "largest token" is presumably what was meant - confirm.
    let mut buf_size = 4096;
    while buf_size > 0 {
        println!("Testing buffer size: {}", buf_size);
        // Borrow the input rather than cloning the whole file for each size.
        let mut parser = Parser::new(Cursor::new(input.as_str()), buf_size, 1024);
        let actual = match to_serde_json_value(&mut parser) {
            Ok(value) => value.unwrap(),
            // Success, we got down to the minimum buffer size for a token in
            // this dataset before crashing
            Err(Error::TokenTooLarge) => break,
            // Unterminated string is also a valid case here for the buffer
            // getting too small.
            Err(Error::InvalidInput {
                input: ref kind, ..
            }) if InvalidInput::UnterminatedString == *kind => break,
            // Every other error, including any other kind of invalid input,
            // is reported together with the buffer size that triggered it.
            Err(e) => {
                panic!(
                    "Unexpected error in test for size: '{}' - {:?}",
                    buf_size, e
                );
            }
        };
        parity_test_helper(
            &format!("buf_size_parity_test_{}", buf_size),
            expected.clone(),
            actual,
        );

        buf_size -= 1;
    }
}

// Where to write out the results of failed parity tests. The expected and
// actual values are each written as `<test name>_expected.txt` and
// `<test name>_actual.txt` inside this directory.
const FAILS_DIRECTORY: &str = "tests/failures";

// Relative tolerance for float comparisons: two floats are treated as equal
// when their absolute difference is at most this fraction of the larger
// magnitude.
// Value based on nothing but feel, perhaps there's a better option
const FLOAT_TOLERANCE: f64 = 1e-12;

/// Generates a `#[test]` function named `$file` that reads
/// `<$dir>/<$file>.json` and passes its contents to `run_parity_test`.
///
/// `$dir` must name something in scope at the call site that can be formatted
/// with `{}`; `$file` is both the test name and the fixture's file stem.
#[macro_export]
macro_rules! parity_test_file {
    ($dir:ident, $file:ident) => {
        #[test]
        fn $file() {
            let input =
                std::fs::read_to_string(format!("{}/{}.json", $dir, stringify!($file))).unwrap();
            // Report the test under its own name so that each failing test
            // writes its own pair of failure files.
            run_parity_test(stringify!($file), &input);
        }
    };
}

/// Generates a `#[test]` function named `$test_name` that passes the JSON
/// text `$json` to `run_parity_test`, using the function's own name as the
/// test name.
#[macro_export]
macro_rules! parity_test_lit {
    ($test_name:ident, $json:expr) => {
        #[test]
        fn $test_name() {
            let test_name = stringify!($test_name);
            run_parity_test(test_name, $json);
        }
    };
}

/// Parses `input` with both the event parser and `serde_json`, then hands the
/// two resulting values to `parity_test_helper` under the given test `name`.
///
/// Panics if either parser rejects the input or if the event parser yields no
/// value.
fn run_parity_test(name: &str, input: &str) {
    let buffer_size = 1024 * 2;
    let mut parser = Parser::new(Cursor::new(input), buffer_size, 1024);
    let reference: Value = serde_json::from_str(input).unwrap();
    let parsed = to_serde_json_value(&mut parser).unwrap().unwrap();
    parity_test_helper(name, reference, parsed);
}

/// Compares `expected` and `actual` with `are_within_tolerance` and returns
/// quietly when they match.
///
/// On a mismatch both values are pretty-printed to
/// `<FAILS_DIRECTORY>/<name>_expected.txt` and
/// `<FAILS_DIRECTORY>/<name>_actual.txt`, and the function panics.
fn parity_test_helper(name: &str, expected: Value, actual: Value) {
    if are_within_tolerance(expected.clone(), actual.clone()) {
        return;
    }

    // `create_dir_all` succeeds when the directory already exists, so failing
    // tests running in parallel cannot trip over each other the way a separate
    // existence check followed by `create_dir` could.
    std::fs::create_dir_all(FAILS_DIRECTORY).unwrap();

    // We spit them out as .txt files because a lot of editor tools choke on
    // large files that they have to highlight/format.
    let expected_file =
        std::fs::File::create(format!("{}/{}_{}", FAILS_DIRECTORY, name, "expected.txt")).unwrap();
    let actual_file =
        std::fs::File::create(format!("{}/{}_{}", FAILS_DIRECTORY, name, "actual.txt")).unwrap();

    serde_json::to_writer_pretty(expected_file, &expected).unwrap();
    serde_json::to_writer_pretty(actual_file, &actual).unwrap();

    panic!("Expected and actual not equal")
}

// serde_json uses a different float parsing logic/library than we do and there
// are very small differences at the tail end of numbers that are otherwise
// equal. This will compare values allowing for a `[FLOAT_TOLERANCE]` diff
// in floating points numbers.
//
// Returns `true` when both values have the same shape and every pair of
// leaves matches: nulls, booleans, strings and integers exactly, floats via
// `floats_are_within_tolerance`. Arrays must have equal length and match
// element by element; objects must have equal size and matching values under
// the same keys. Numbers of different kinds (for example an integer against
// a float) never match.
fn are_within_tolerance(v1: Value, v2: Value) -> bool {
    // The comparison only needs to read both trees, so borrow them instead of
    // cloning sub-values at every level of nesting.
    values_within_tolerance(&v1, &v2)
}

// Borrowing implementation of `are_within_tolerance`.
fn values_within_tolerance(v1: &Value, v2: &Value) -> bool {
    match (v1, v2) {
        (Value::Null, Value::Null) => true,
        (Value::Bool(a), Value::Bool(b)) => a == b,
        (Value::String(a), Value::String(b)) => a == b,
        (Value::Number(a), Value::Number(b)) => {
            if a.is_i64() && b.is_i64() {
                a.as_i64() == b.as_i64()
            } else if a.is_u64() && b.is_u64() {
                a.as_u64() == b.as_u64()
            } else if a.is_f64() && b.is_f64() {
                floats_are_within_tolerance(a.as_f64().unwrap(), b.as_f64().unwrap())
            } else {
                // Mismatched number kinds
                false
            }
        }
        (Value::Array(a), Value::Array(b)) => {
            a.len() == b.len()
                && a.iter()
                    .zip(b)
                    .all(|(val_a, val_b)| values_within_tolerance(val_a, val_b))
        }
        (Value::Object(a), Value::Object(b)) => {
            // Equal sizes plus every key of `a` being present in `b` means
            // the two key sets are identical.
            a.len() == b.len()
                && a.iter().all(|(key, val_a)| match b.get(key) {
                    Some(val_b) => values_within_tolerance(val_a, val_b),
                    None => false,
                })
        }
        _ => false, // Different types
    }
}

/// Pulls the next event from `parser` and builds the `serde_json` value it
/// starts, recursing through `create_object` / `create_list` for containers.
///
/// Returns `Ok(None)` when the next event is `EndArray`, which is how
/// `create_list` detects the end of its list. Parser errors are propagated.
///
/// Panics on any other event, on string bytes that are not valid UTF-8, and
/// on a float that `serde_json::Number::from_f64` rejects.
fn to_serde_json_value<R: Read>(
    parser: &mut Parser<R>,
) -> error::Result<Option<serde_json::Value>> {
    let converted = match parser.next()? {
        Event::EndArray => return Ok(None),
        Event::Null => Value::Null,
        Event::True => Value::Bool(true),
        Event::False => Value::Bool(false),
        Event::String(bytes) => Value::String(String::from_utf8(bytes.to_owned()).unwrap()),
        Event::Number(parsed) => Value::Number(match parsed {
            lex::Number::PositiveInt(i) => serde_json::Number::from(i),
            lex::Number::NegativeInt(i) => serde_json::Number::from(i),
            lex::Number::Float(f) => serde_json::Number::from_f64(f).unwrap(),
        }),
        Event::StartObject => create_object(parser)?,
        Event::StartArray => create_list(parser)?,
        // `create_object` consumes the object keys and the EndObject events
        // itself, so none of the remaining events should ever show up here:
        // the parser either reports an error or emits a valid sequence.
        _ => unreachable!(),
    };

    Ok(Some(converted))
}

/// Builds a JSON object from the events that follow a `StartObject`, reading
/// key/value pairs until the matching `EndObject` is seen.
///
/// Parser errors are propagated. Panics if a key is not valid UTF-8, if a
/// key is not followed by a value, or if any event other than `ObjectKey` /
/// `EndObject` appears where a key is expected.
fn create_object<R: Read>(parser: &mut Parser<R>) -> error::Result<serde_json::Value> {
    let mut members = Map::new();
    loop {
        let name = match parser.next()? {
            Event::EndObject => return Ok(Value::Object(members)),
            Event::ObjectKey(raw) => String::from_utf8(raw.to_owned()).unwrap(),
            _ => {
                unreachable!("Parser should either emit errors or produce valid events");
            }
        };
        // A key is always followed by a value, otherwise the parser would
        // already have returned an error.
        let member = to_serde_json_value(parser)?.unwrap();
        members.insert(name, member);
    }
}

/// Builds a JSON array from the events that follow a `StartArray`, collecting
/// values until `to_serde_json_value` yields `None`.
///
/// Parser errors are propagated.
fn create_list<R: Read>(parser: &mut Parser<R>) -> error::Result<serde_json::Value> {
    let mut items = Vec::new();
    loop {
        match to_serde_json_value(parser)? {
            Some(item) => items.push(item),
            None => return Ok(Value::Array(items)),
        }
    }
}

/// Returns `true` when `a` and `b` are equal, or when both are finite and
/// their absolute difference is at most `FLOAT_TOLERANCE` times the larger of
/// their magnitudes.
///
/// NaN never matches anything, and an infinite value only matches an equal
/// infinite value.
fn floats_are_within_tolerance(a: f64, b: f64) -> bool {
    if a == b {
        return true;
    }

    // Past the equality check a NaN or infinite operand can never match.
    // Without this guard an infinite operand makes both the difference and
    // the scaled tolerance infinite, so the final comparison would succeed.
    if !a.is_finite() || !b.is_finite() {
        return false;
    }

    let diff = (a - b).abs();
    let largest = a.abs().max(b.abs());

    diff <= FLOAT_TOLERANCE * largest
}