jsn 0.14.0

A library for querying streaming JSON tokens
Documentation
// Validate that JSON is well structured
// This is done using a state machine pattern

use crate::error::Reason;
use crate::iter::Format;
use crate::raw_token::RawToken;

/// Position of the validator within the JSON structure being parsed. Each variant is named
/// after the token that was accepted most recently.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
enum State {
    /// Initial state: no token has been accepted yet.
    Begin,
    /// End state: the input ended after a complete JSON structure.
    End,
    /// A JSON primitive was just seen at the top level.
    TopLevelPrimitive,
    /// The start of an object was just seen.
    ObjStart,
    /// The end of an object was just seen.
    ObjEnd,
    /// An object key was just seen.
    ObjKey,
    /// A colon was just seen, so a value comes next.
    ObjColon,
    /// An object value was just seen.
    ObjValue,
    /// A comma was just seen inside an object, so a key must come next.
    ObjComma,
    /// The start of an array was just seen.
    ArrStart,
    /// The end of an array was just seen.
    ArrEnd,
    /// An array value was just seen.
    ArrValue,
    /// A comma was just seen inside an array, so a new value must come next.
    ArrComma,
}

/// The kind of JSON container currently being looked at.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
enum ContainerMarker {
    /// A JSON array.
    Array,
    /// A JSON object.
    Object,
}

/// Validator that checks whether a sequence of raw tokens is well structured JSON.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct JsonStructure {
    /// State reached after the most recently accepted token.
    state: State,
    /// Input format the validator was created for.
    format: Format,
    /// Containers that are currently open, innermost last. A state machine alone cannot validate
    /// that nested structures are properly matched up, so this stack keeps track of where we are
    /// in the object/array nesting.
    stack: Vec<ContainerMarker>,
}

impl JsonStructure {
    /// Computes the state reached when `token` is the first token of a JSON document.
    ///
    /// Used both for the initial state and for the concatenated-JSON restarts, so that a restart
    /// never has to modify the validator before the token is known to be acceptable.
    fn begin_value(token: &RawToken) -> Result<State, Reason> {
        match token {
            // Json can only begin with an array, object or primitive value
            RawToken::ArrayStart => Ok(State::ArrStart),
            RawToken::ObjectStart => Ok(State::ObjStart),
            t if t.is_primitive_value() => Ok(State::TopLevelPrimitive),
            RawToken::Eof => Err(Reason::UnexpectedEof),
            _ => Err(Reason::UnexpectedChar),
        }
    }

    /// Figures out what the next state would be given `next_token`.
    ///
    /// This is a pure lookup: neither the current state nor the container stack is modified, so
    /// a rejected token leaves the validator exactly as it was.
    ///
    /// # Errors
    ///
    /// Returns [`Reason::UnexpectedEof`] when the end of input arrives before the JSON structure
    /// is complete, and [`Reason::UnexpectedChar`] for any other token that is not allowed in
    /// the current state.
    #[inline]
    fn transition(&self, next_token: &RawToken) -> Result<State, Reason> {
        use State::*;

        match (self.state, next_token) {
            // ==
            // == BEGIN
            // ==
            (Begin, t) => Self::begin_value(t),

            // ==
            // == TopLevelPrimitive
            // ==

            // Generally, nothing can come after a top level primitive.
            (TopLevelPrimitive, RawToken::Eof) => Ok(End),
            // Unless we are configured to read concatenated JSON, in which case the token is
            // treated as the start of a new document
            (TopLevelPrimitive, t) if self.format == Format::Concatenated => Self::begin_value(t),

            // ==
            // == Array tokens ==
            // ==

            // After an open bracket `[`, we only expect to see `[`, `]`, `{` or a primitive
            // value
            (ArrStart, RawToken::ArrayStart) => Ok(ArrStart),
            (ArrStart, RawToken::ArrayEnd) => Ok(ArrEnd),
            (ArrStart, RawToken::ObjectStart) => Ok(ObjStart),
            (ArrStart, t) if t.is_primitive_value() => Ok(ArrValue),

            // After a comma in an array, we only expect to see `[`, `{` or a primitive value
            (ArrComma, RawToken::ArrayStart) => Ok(ArrStart),
            (ArrComma, RawToken::ObjectStart) => Ok(ObjStart),
            (ArrComma, t) if t.is_primitive_value() => Ok(ArrValue),

            // After seeing a primitive array value, the only tokens we expect are `]` or `,`
            (ArrValue, RawToken::Comma) => Ok(ArrComma),
            (ArrValue, RawToken::ArrayEnd) => Ok(ArrEnd),

            // ==
            // == Object tokens ==
            // ==

            // After an open brace `{`, we only expect to see `}` or a string (the object's
            // first key)
            (ObjStart, RawToken::ObjectEnd) => Ok(ObjEnd),
            (ObjStart, RawToken::String(_)) => Ok(ObjKey),

            // After a comma in an object, we only expect to see a string
            (ObjComma, RawToken::String(_)) => Ok(ObjKey),

            // After seeing an object key, the only allowed token is a colon
            (ObjKey, RawToken::Colon) => Ok(ObjColon),

            // After seeing a colon, the only allowed tokens are `[`, `{`, or a primitive value
            (ObjColon, RawToken::ArrayStart) => Ok(ArrStart),
            (ObjColon, RawToken::ObjectStart) => Ok(ObjStart),
            (ObjColon, t) if t.is_primitive_value() => Ok(ObjValue),

            // After seeing a primitive object value, the only allowed tokens are `}` or `,`
            (ObjValue, RawToken::Comma) => Ok(ObjComma),
            (ObjValue, RawToken::ObjectEnd) => Ok(ObjEnd),

            // ==
            // == Shared transitions for object & array tokens
            // ==
            //
            // The container that was just closed has already been popped when these arms run,
            // so the top of the stack is the container that encloses it (if any).

            // If we just closed an array or object, and are at the end of input, we can go to
            // the `End` state. All structures need to be closed by this point
            (ArrEnd | ObjEnd, RawToken::Eof) => {
                if self.stack.is_empty() {
                    Ok(End)
                } else {
                    Err(Reason::UnexpectedEof)
                }
            }
            // If we just closed an array or object and then saw a comma, the next state
            // depends on the enclosing container
            (ArrEnd | ObjEnd, RawToken::Comma) => match self.stack.last() {
                Some(ContainerMarker::Array) => Ok(ArrComma),
                Some(ContainerMarker::Object) => Ok(ObjComma),
                None => Err(Reason::UnexpectedChar),
            },
            // Seeing the end of an array after a `}` or `]` is only valid if the current
            // unclosed structure is an array
            (ArrEnd | ObjEnd, RawToken::ArrayEnd) => match self.stack.last() {
                Some(ContainerMarker::Array) => Ok(ArrEnd),
                _ => Err(Reason::UnexpectedChar),
            },
            // Seeing the end of an object after a `}` or `]` is only valid if the current
            // unclosed structure is an object
            (ArrEnd | ObjEnd, RawToken::ObjectEnd) => match self.stack.last() {
                Some(ContainerMarker::Object) => Ok(ObjEnd),
                _ => Err(Reason::UnexpectedChar),
            },
            // If we are configured to recognize concatenated JSON, and there are no unclosed
            // structures, the token is treated as the start of a new document
            (ArrEnd | ObjEnd, t)
                if self.format == Format::Concatenated && self.stack.is_empty() =>
            {
                Self::begin_value(t)
            }

            // ==
            // == The end
            // ==

            // Seeing an eof token after we've fully parsed a json structure is a no-op
            (End, RawToken::Eof) => Ok(End),

            // Seeing an eof token any other time is unexpected
            (_, RawToken::Eof) => Err(Reason::UnexpectedEof),

            // Everything that has not been explicitly handled is unexpected
            _ => Err(Reason::UnexpectedChar),
        }
    }

    /// Performs the bookkeeping that goes with entering the state stored in `self.state`:
    /// opening a container pushes its marker, closing one pops the innermost marker.
    #[inline]
    fn on_enter_state(&mut self) {
        use State::*;

        match self.state {
            ArrStart => self.stack.push(ContainerMarker::Array),
            ObjStart => self.stack.push(ContainerMarker::Object),
            ArrEnd | ObjEnd => {
                // `transition` only yields these states while the matching container is on top
                // of the stack, so the popped marker itself is of no further interest.
                self.stack.pop();
            }
            // Listed explicitly so that adding a new state forces a decision here.
            Begin | End | TopLevelPrimitive | ObjKey | ObjColon | ObjValue | ObjComma
            | ArrValue | ArrComma => {}
        }
    }

    /// Creates a validator for the given input `format`, positioned before the first token.
    pub fn new(format: Format) -> Self {
        Self {
            state: State::Begin,
            format,
            stack: Vec::new(),
        }
    }

    /// Returns the validator to its initial state and forgets all open containers. The
    /// configured format is kept.
    pub fn reset(&mut self) {
        self.state = State::Begin;
        self.stack.clear();
    }

    /// Checks that `token` is allowed at the current position and advances the validator.
    ///
    /// On success the token is handed back, except that a string accepted as an object key is
    /// returned as `RawToken::ObjectKey` instead of `RawToken::String`.
    ///
    /// # Errors
    ///
    /// Returns the [`Reason`] the token was rejected for. A rejected token does not change the
    /// validator.
    #[inline]
    pub fn validate<'t>(&mut self, token: RawToken<'t>) -> Result<RawToken<'t>, Reason> {
        // `?` returns before the assignment happens, so the state only moves on success.
        self.state = self.transition(&token)?;
        self.on_enter_state();

        // A string that landed us in `ObjKey` is a key, not a value: report it as such.
        let token = match (self.state, &token) {
            (State::ObjKey, RawToken::String(s)) => RawToken::ObjectKey(s),
            _ => token,
        };

        Ok(token)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::input::Input;
    use crate::scan::Scanner;

    /// Scans `json` and validates each token under `format`, checking that the validated tokens
    /// are exactly `expected` and that nothing is left to scan afterwards.
    #[track_caller]
    fn pass<I>(format: Format, json: &str, expected: I)
    where
        I: IntoIterator<Item = RawToken<'static>>,
    {
        let mut input = Input::new(json.as_bytes());
        let mut scanner = Scanner::new();
        let mut structure = JsonStructure::new(format);

        for want in expected {
            let got = scanner
                .read_token(&mut input)
                .and_then(|t| structure.validate(t))
                .expect("failed to parse json");

            assert_eq!(want, got);
        }

        // Make sure there is nothing else to parse
        assert_eq!(scanner.read_token(&mut input), Ok(RawToken::Eof));
    }

    /// Scans `json` and validates each token under `format`, checking that validation is
    /// rejected with `expected`. Panics if the end of input validates successfully instead.
    #[track_caller]
    fn fail(format: Format, json: &str, expected: Reason) {
        let mut input = Input::new(json.as_bytes());
        let mut scanner = Scanner::new();
        let mut structure = JsonStructure::new(format);

        loop {
            let token = scanner
                .read_token(&mut input)
                .expect("scanning should succeed");

            match structure.validate(token) {
                Ok(RawToken::Eof) => panic!("json was unexpectedly valid: {}", json),
                // Token accepted, keep feeding the validator
                Ok(_) => {}
                Err(reason) => {
                    assert_eq!(reason, expected);
                    return;
                }
            };
        }
    }

    #[test]
    fn nesting_tests() {
        // Input ends while the outer array is still open
        fail(Format::Regular, "[[]", Reason::UnexpectedEof);
        // Closing tokens with no open container left
        fail(Format::Regular, "[]]", Reason::UnexpectedChar);
        fail(Format::Regular, "{}}", Reason::UnexpectedChar);
        // Input ends right after an array value
        fail(Format::Regular, "[null", Reason::UnexpectedEof);
    }

    #[test]
    fn streaming_json() {
        // Test that top level containers and primitives can be followed by any other json value
        // in streaming mode
        pass(
            Format::Concatenated,
            "[]{}truenull1\"hello\"[]",
            [
                RawToken::ArrayStart,
                RawToken::ArrayEnd,
                RawToken::ObjectStart,
                RawToken::ObjectEnd,
                RawToken::Bool(true),
                RawToken::Null,
                RawToken::Number(b"1"),
                RawToken::String("hello"),
                RawToken::ArrayStart,
                RawToken::ArrayEnd,
                RawToken::Eof,
            ],
        );
    }
}