// jsn 0.14.0
//
// A library for querying streaming JSON tokens
// Documentation
use crate::error::{JsonError, Position};
use crate::input::Input;
use crate::mask::{self, Mask};
use crate::raw_token::RawToken;
use crate::scan::Scanner;
use crate::structure::JsonStructure;
use crate::token::Token;
use std::io::{self, Read};

/// Format of the JSON input.
///
/// The default is [`Format::Regular`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Format {
    /// Regular, strict RFC8259 JSON
    Regular,
    /// Concatenated or newline-delimited JSON
    Concatenated,
}

impl Default for Format {
    /// Returns [`Format::Regular`].
    fn default() -> Format {
        Format::Regular
    }
}

/// A streaming JSON token pull parser
///
/// A `TokenReader` only holds configuration (the input, the token mask and the JSON format).
/// Parsing happens in the [`Tokens`] iterator it is converted into.
///
/// Implements [`IntoIterator`] so it can be directly used in a `for` loop:
/// ```
/// use jsn::TokenReader;
///
/// for token in TokenReader::new("[]".as_bytes()) {
///     println!("{}", token.unwrap());
/// }
/// ```
///
/// But you can work with the iterator directly as well:
/// ```
/// use jsn::{Token, TokenReader};
///
/// let mut iter = TokenReader::new("[]".as_bytes())
///     .into_iter();
///
/// assert_eq!(iter.next().unwrap(), Ok(Token::ArrayStart));
/// assert_eq!(iter.next().unwrap(), Ok(Token::ArrayEnd));
/// assert_eq!(iter.next(), None);
/// ```
///
/// See [`Tokens`](Tokens) for details on using the iterator.
#[derive(Debug, Clone)]
pub struct TokenReader<R, M> {
    /// The JSON input; handed over to the iterator by `into_iter`.
    input: R,
    /// The token mask the iterator will use to decide which tokens to yield.
    mask: M,
    /// The format the JSON input is expected to be in.
    format: Format,
}

impl<R: Read> TokenReader<R, mask::All> {
    /// Creates a new `TokenReader` out of anything that implements the [`Read`](std::io::Read)
    /// trait.
    ///
    /// ## Buffering
    ///
    /// The parser makes _small_ and _repeated_ reads, so if your `Read` instance is not already in
    /// memory, you probably want to wrap it in a [`BufReader`](std::io::BufReader) first. Not
    /// doing this will result in abysymal performance.
    pub fn new(input: R) -> Self {
        Self {
            mask: mask::All,
            input,
            format: Format::default(),
        }
    }
}

impl<R: Read, M: Mask> TokenReader<R, M> {
    /// Replaces the token mask the iterator will use.
    ///
    /// Token masks limit the tokens the iterator yields. The default mask is
    /// [`mask::all`](crate::mask::all). Refer to the [mask module documentation](crate::mask) for
    /// more details.
    ///
    /// Note that this is distinct from using `Iterator` methods like [`Iterator::skip_while`] or
    /// [`Iterator::filter`]. Using a mask implies the parser will not allocate heap memory for
    /// tokens (specifically string & number tokens) that do not match. Filtering using iterator
    /// methods happens after the parser has allocated heap memory for the token.
    ///
    /// The input and the format carry over unchanged; only the mask (and therefore the mask type
    /// parameter) is swapped.
    pub fn with_mask<N: Mask>(self, mask: N) -> TokenReader<R, N> {
        // The previous mask is intentionally left behind and dropped with `self`.
        let TokenReader { input, format, .. } = self;

        TokenReader {
            input,
            mask,
            format,
        }
    }

    /// Sets the JSON format.
    ///
    /// Defaults to [`Format::Regular`].
    ///
    /// The parser ignores newlines (this is not configurable), so setting this to
    /// [`Format::Concatenated`] implies the parser will also recognize newline-delimited JSON.
    ///
    /// You can use this in combination with a mask to extract tokens from every json value in the
    /// stream:
    /// ```
    /// use jsn::{mask::*, TokenReader, Format};
    ///
    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
    /// let json = r#"
    ///     { "temp": 1 }
    ///     { "temp": 2 }
    ///     { "temp": 3 }
    /// "#.as_bytes();
    ///
    /// let mut iter = TokenReader::new(json)
    ///     .with_mask(key("temp"))
    ///     .with_format(Format::Concatenated)
    ///     .into_iter();
    ///
    /// let values = iter.collect::<Result<Vec<_>, _>>()?;
    ///
    /// assert_eq!(values[0], 1);
    /// assert_eq!(values[1], 2);
    /// assert_eq!(values[2], 3);
    ///
    /// # Ok(())
    /// # }
    /// ```
    pub fn with_format(self, format: Format) -> Self {
        // Keep the input and mask, overriding only the format.
        TokenReader { format, ..self }
    }
}

impl<R: Read, M: Mask> IntoIterator for TokenReader<R, M> {
    type Item = Result<Token, JsonError>;
    type IntoIter = Tokens<R, M>;

    fn into_iter(self) -> Self::IntoIter {
        Tokens {
            input: Input::new(self.input),
            structure: JsonStructure::new(self.format),
            mask: self.mask,
            scanner: Scanner::new(),
            failure: None,
        }
    }
}

/// An iterator over JSON [`Token`]s.
///
/// The [`next()`](Tokens::next) method parses and returns the next token in the input.
///
/// This iterator is fallible, so its items are wrapped in a `Result`.
/// ```
/// use jsn::TokenReader;
///
/// // You can collect the tokens into a `Vec` of `Result`s
/// let reader = TokenReader::new("[]".as_bytes());
/// let tokens = reader.into_iter().collect::<Vec<Result<_, _>>>();
/// // But also a `Result` of `Vec`s.
/// let reader = TokenReader::new("[]".as_bytes());
/// let tokens = reader.into_iter().collect::<Result<Vec<_>, _>>();
/// ```
///
/// That said, if:
/// - you are confident that your json is valid AND
/// - you are confident there won't be any errors reading the input AND
/// - you enjoy living on the edge...
///
/// ... then you can use [`Iterator::flatten`] to "ignore" the wrapped `Result`.
///
/// ```
/// use jsn::{Token, TokenReader};
///
/// let iter = TokenReader::new("[]".as_bytes())
///     .into_iter()
///     .flatten();
/// let tokens = iter.collect::<Vec<_>>();
///
/// // If the json is valid, all tokens are yielded as expected...
/// assert_eq!(tokens, vec![ Token::ArrayStart, Token::ArrayEnd ]);
///
/// let iter = TokenReader::new("[".as_bytes())
///     .into_iter()
///     .flatten();
/// let tokens = iter.collect::<Vec<_>>();
///
/// // If the json is invalid, only the tokens before the error
/// // will be yielded...
/// assert_eq!(tokens, vec![ Token::ArrayStart ]);
/// ```
#[derive(Debug)]
pub struct Tokens<R, M> {
    /// The error recorded by `next()` or `dry_run()`, if any. Once set, `next()` yields `None`
    /// and `dry_run()` / `reset()` return this same error.
    failure: Option<JsonError>,
    /// Validates every token produced by the scanner; configured with the reader's `Format`.
    structure: JsonStructure,
    /// Reads raw tokens from `input`.
    scanner: Scanner,
    /// The wrapped input; also the source of the position reported in errors.
    input: Input<R>,
    /// Decides which of the validated tokens are selected.
    mask: M,
}

/// Result of calling [`Tokens::dry_run`]
///
/// Both counters only cover the input that was still unread when `dry_run` was called, and
/// neither includes the end-of-input marker.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct DryRun {
    /// The number of tokens seen in the input
    pub token_count: usize,
    /// The number of tokens that were matched by the active token mask
    pub selected_token_count: usize,
}

impl<R: io::Read, M: Mask> Tokens<R, M> {
    /// Returns the error encountered by the iterator, if any.
    pub fn error(&self) -> Option<JsonError> {
        self.failure
    }

    /// Parse and validate whatever is left of the JSON input.
    ///
    /// Returns the number of tokens seen as well as the number of tokens that would have matched
    /// the active mask:
    ///
    /// ```
    /// use jsn::{TokenReader, Token, mask::*};
    ///
    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
    /// let json = r#"{ "key": "value" }"#.as_bytes();
    /// let mut iter = TokenReader::new(json)
    ///     .with_mask(keys())
    ///     .into_iter();
    ///
    /// // Because of the mask, we skip the open brace
    /// let first_key = iter.next().unwrap()?;
    /// assert_eq!(first_key, "key");
    ///
    /// let dry_run = iter.dry_run()?;
    /// // What is left is the colon, string and close brace
    /// assert_eq!(dry_run.token_count, 3);
    /// assert_eq!(dry_run.selected_token_count, 0);
    ///
    /// # Ok(())
    /// # }
    /// ```
    ///
    /// This method performs no allocations for the tokens it comes across because it operates on
    /// raw tokens borrowed from the JSON input.
    ///
    /// If the input type implements [`Seek`](std::io::Seek), you can
    /// ["rewind"](crate::Tokens::reset) and start parsing from the begining again.
    ///
    /// If the iterator had previously encountered an error, that same error is returned.
    pub fn dry_run(&mut self) -> Result<DryRun, JsonError> {
        if let Some(err) = self.failure {
            return Err(err);
        }

        let mut counter = DryRun {
            token_count: 0,
            selected_token_count: 0,
        };

        loop {
            let raw_token = self
                .scanner
                .read_token(&mut self.input)
                .and_then(|t| self.structure.validate(t));

            match raw_token {
                Ok(RawToken::Eof) => return Ok(counter),
                Ok(t) => {
                    counter.token_count += 1;
                    if self.mask.match_token(&t) {
                        counter.selected_token_count += 1;
                    }
                }
                Err(reason) => {
                    let position = self.input.position();
                    let e = JsonError { position, reason };
                    self.failure = Some(e);
                    return Err(e);
                }
            };
        }
    }
}

impl<R: io::Seek, M: Mask> Tokens<R, M> {
    /// "Rewinds" the iterator so that parsing starts over from the beginning.
    ///
    /// On success the input, the structure validator and the scanner have all been reset.
    ///
    /// # Errors
    ///
    /// - Fails with the recorded error if the iterator previously encountered one.
    /// - Fails if the `Seek` instance could not be rewound; the error then carries the default
    ///   [`Position`].
    pub fn reset(&mut self) -> Result<(), JsonError> {
        match self.failure {
            Some(previous) => Err(previous),
            None => {
                if let Err(reason) = self.input.reset() {
                    return Err(JsonError {
                        position: Position::default(),
                        reason,
                    });
                }

                self.structure.reset();
                self.scanner.reset();

                Ok(())
            }
        }
    }
}

impl<R: io::Read, M: Mask> Iterator for Tokens<R, M> {
    type Item = Result<Token, JsonError>;

    /// Advance the iterator and return the next JSON token.
    ///
    /// The iterator will return `Some(Err(_))` when it encounters invalid JSON or fails to read
    /// from the input. Further invocations of `next()` will return `None`:
    ///
    /// ```
    /// use jsn::{TokenReader, Token};
    ///
    /// let invalid_json = "[,]".as_bytes();
    ///
    /// let mut iter = TokenReader::new(invalid_json).into_iter();
    ///
    /// assert_eq!(iter.next().unwrap(), Ok(Token::ArrayStart));
    /// assert!(iter.next().unwrap().is_err());
    /// assert_eq!(iter.next(), None);
    /// ```
    fn next(&mut self) -> Option<Self::Item> {
        if self.failure.is_some() {
            return None;
        }

        loop {
            let raw_token = self
                .scanner
                .read_token(&mut self.input)
                .and_then(|t| self.structure.validate(t));

            match raw_token {
                Ok(RawToken::Eof) => return None,
                Ok(t) => {
                    if self.mask.match_token(&t) {
                        return Some(Ok(Token::from(t)));
                    }
                }
                Err(reason) => {
                    let position = self.input.position();
                    let e = JsonError { position, reason };
                    self.failure = Some(e);
                    return Some(Err(e));
                }
            };
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::{JsonError, Reason};
    use std::io::Cursor;

    /// After the first error, `next()` keeps yielding `None`, while `dry_run()` and `reset()`
    /// keep reporting that same error.
    #[test]
    fn returns_none_after_failure() {
        let expected = JsonError {
            position: Position {
                offset: 1,
                line: 1,
                col: 2,
            },
            reason: Reason::ExpectedNumber,
        };

        let mut iter = TokenReader::new(Cursor::new("01")).into_iter();

        // The first call surfaces the error itself
        assert_eq!(iter.next(), Some(Err(expected)));

        // Calling .next() will begin returning `None`
        for _ in 0..4 {
            assert_eq!(iter.next(), None);
        }

        // validation should return the error
        assert_eq!(iter.dry_run(), Err(expected));

        // attempting to reset should also return the same error
        assert_eq!(iter.reset(), Err(expected));
    }
}