Available on crate feature `unstable-doc` only.
§Parsing Partial Input
Typically, the input being parsed is all in-memory, or is complete. Some data sources are too
large to fit into memory, only allowing parsing an incomplete or Partial subset of the
data, requiring incremental parsing.
By wrapping a stream, like &[u8], with Partial, parsers will report when the data is
Incomplete and more input is Needed, allowing the caller to stream-in additional data
to be parsed. The data is then parsed a chunk at a time.
Chunks are typically defined by either:
- A header reporting the number of bytes, like with `length_and_then`. `Partial` can explicitly
  be changed to being complete once the specified bytes are acquired via
  `StreamIsPartial::complete`.
- A delimiter, like with ndjson
  - You can parse up-to the delimiter or do a
    `take_until(0.., delim).and_then(parser)`
If the chunks are not homogeneous, a state machine will be needed to track what the expected parser is for the next chunk.
Caveats:
- `winnow` takes the approach of re-parsing from scratch. Chunks should be relatively small to
  prevent the re-parsing overhead from dominating.
- Parsers like `repeat` do not know when an `eof` is from insufficient data or the end of the
  stream, causing them to always report `Incomplete`.
§Example
main.rs:
ⓘ
mod parser;
use std::io::Read;
use winnow::error::ContextError;
use winnow::error::ErrMode;
use winnow::error::Needed;
use winnow::prelude::*;
use winnow::stream::Offset;
/// Incrementally read newline-delimited JSON from the file given on the command line,
/// growing the read buffer whenever the parser reports `Incomplete`.
fn main() -> Result<(), lexopt::Error> {
    let args = Args::parse()?;
    let input = args.input.ok_or_else(|| lexopt::Error::MissingValue {
        option: Some("<PATH>".to_owned()),
    })?;
    let mut file = std::fs::File::open(input).map_err(to_lexopt)?;

    // Intentionally starting with a small buffer to make it easier to show `Incomplete` handling
    let buffer_size = 10;
    let min_buffer_growth = 100;
    let buffer_growth_factor = 2;
    let mut buffer = circular::Buffer::with_capacity(buffer_size);
    loop {
        // Top up the buffer with the next chunk from the file.
        let read = file.read(buffer.space()).map_err(to_lexopt)?;
        eprintln!("read {read}");
        if read == 0 {
            // Should be EOF since we always make sure there is `available_space`
            assert_ne!(buffer.available_space(), 0);
            // At EOF no partial record may remain, or the input was truncated.
            assert_eq!(
                buffer.available_data(),
                0,
                "leftover data: {}",
                String::from_utf8_lossy(buffer.data())
            );
            break;
        }
        buffer.fill(read);

        // Parse as many complete records as the buffered data allows before reading more.
        loop {
            let mut input =
                parser::Stream::new(std::str::from_utf8(buffer.data()).map_err(to_lexopt)?);
            // Checkpoint lets us measure how far the parser advanced on success.
            let start = input.checkpoint();
            match parser::ndjson::<ContextError>.parse_next(&mut input) {
                Ok(value) => {
                    println!("{value:?}");
                    println!();

                    // Tell the buffer how much we read
                    let consumed = input.offset_from(&start);
                    buffer.consume(consumed);
                }
                Err(ErrMode::Backtrack(e)) | Err(ErrMode::Cut(e)) => {
                    // A real parse error (not missing data): report and stop.
                    return Err(fmt_lexopt(e.to_string()));
                }
                Err(ErrMode::Incomplete(Needed::Size(size))) => {
                    // Without the format telling us how much space is required, we really should
                    // treat this the same as `Unknown` but are doing this to demonstrate how to
                    // handle `Size`.
                    //
                    // Even when the format has a header to tell us `Size`, we could hit incidental
                    // `Size(1)`s, so make sure we buffer more space than that to avoid reading
                    // one byte at a time
                    let head_room = size.get().max(min_buffer_growth);
                    let new_capacity = buffer.available_data() + head_room;
                    eprintln!("growing buffer to {new_capacity}");
                    buffer.grow(new_capacity);
                    if buffer.available_space() < head_room {
                        // `grow` may not free consumed space; compact the ring buffer.
                        eprintln!("buffer shift");
                        buffer.shift();
                    }
                    break;
                }
                Err(ErrMode::Incomplete(Needed::Unknown)) => {
                    // No size hint: grow geometrically and retry with more data.
                    let new_capacity = buffer_growth_factor * buffer.capacity();
                    eprintln!("growing buffer to {new_capacity}");
                    buffer.grow(new_capacity);
                    break;
                }
            }
        }
    }

    Ok(())
}
/// Command-line arguments for the example.
#[derive(Default)]
struct Args {
    /// Path to the newline-delimited JSON file to parse.
    input: Option<std::path::PathBuf>,
}
impl Args {
    /// Collect command-line arguments; the only accepted argument is a single
    /// positional `<PATH>`.
    fn parse() -> Result<Self, lexopt::Error> {
        use lexopt::prelude::*;

        let mut parsed = Args::default();
        let mut raw = lexopt::Parser::from_env();
        while let Some(arg) = raw.next()? {
            match arg {
                // The one positional argument is the input path.
                Value(path) => parsed.input = Some(path.into()),
                // Any flag or option is rejected.
                _ => return Err(arg.unexpected()),
            }
        }
        Ok(parsed)
    }
}
fn to_lexopt(e: impl std::error::Error + Send + Sync + 'static) -> lexopt::Error {
lexopt::Error::Custom(Box::new(e))
}
/// Wrap an already-formatted error message in a `lexopt::Error` for `main` to return.
fn fmt_lexopt(e: String) -> lexopt::Error {
    lexopt::Error::Custom(e.into())
}
parser.rs:
ⓘ
use std::collections::HashMap;
use std::str;
use winnow::prelude::*;
use winnow::{
ascii::float,
ascii::line_ending,
combinator::alt,
combinator::cut_err,
combinator::{delimited, preceded, separated_pair, terminated},
combinator::{repeat, separated},
error::{AddContext, ParserError, StrContext},
stream::Partial,
token::{any, none_of, take, take_while},
};
/// In-memory representation of a parsed JSON value.
#[derive(Debug, PartialEq, Clone)]
pub(crate) enum JsonValue {
    /// JSON `null`
    Null,
    /// JSON `true` / `false`
    Boolean(bool),
    /// JSON string, with escapes already resolved
    Str(String),
    /// JSON number; every number is parsed as `f64`
    Num(f64),
    /// JSON array of arbitrary values
    Array(Vec<JsonValue>),
    /// JSON object; key order is not preserved
    Object(HashMap<String, JsonValue>),
}
/// Use `Partial` to cause `ErrMode::Incomplete` while parsing
///
/// All parsers below operate on this stream type so they can report that more
/// input is `Needed` instead of failing at end of input.
pub(crate) type Stream<'i> = Partial<&'i str>;
/// Parse one newline-delimited JSON record: either a value terminated by a
/// line ending (`Some(value)`) or a blank line (`None`).
pub(crate) fn ndjson<'i, E: ParserError<Stream<'i>> + AddContext<Stream<'i>, StrContext>>(
    input: &mut Stream<'i>,
) -> ModalResult<Option<JsonValue>, E> {
    alt((
        // A value may be surrounded by non-newline whitespace
        terminated(delimited(ws, json_value, ws), line_ending).map(Some),
        // An empty line carries no value
        line_ending.value(None),
    ))
    .parse_next(input)
}
// --Besides `WS`, same as a regular json parser ----------------------------
/// `alt` is a combinator that tries multiple parsers one by one, until
/// one of them succeeds
fn json_value<'i, E: ParserError<Stream<'i>> + AddContext<Stream<'i>, StrContext>>(
    input: &mut Stream<'i>,
) -> ModalResult<JsonValue, E> {
    // `alt` combines each value parser. It returns the result of the first
    // successful parser, or an error
    alt((
        null.value(JsonValue::Null),
        boolean.map(JsonValue::Boolean),
        string.map(JsonValue::Str),
        float.map(JsonValue::Num),
        array.map(JsonValue::Array),
        object.map(JsonValue::Object),
    ))
    .parse_next(input)
}
/// `literal(string)` generates a parser that takes the argument string.
///
/// This also shows returning a sub-slice of the original input
fn null<'i, E: ParserError<Stream<'i>>>(input: &mut Stream<'i>) -> ModalResult<&'i str, E> {
    // Match the keyword and hand back the matched sub-slice of the input;
    // anything else is an error.
    let keyword = "null";
    keyword.parse_next(input)
}
/// We can combine `tag` with other functions, like `value` which returns a given constant value on
/// success.
fn boolean<'i, E: ParserError<Stream<'i>>>(input: &mut Stream<'i>) -> ModalResult<bool, E> {
    // Map the literal keywords straight to their `bool` values; `alt` tries
    // "true" first, then "false", and errors if neither matches.
    alt(("true".value(true), "false".value(false))).parse_next(input)
}
/// This parser gathers all `char`s up into a `String`, with parsers to take the double quote
/// character, before the string (using `preceded`) and after the string (using `terminated`).
fn string<'i, E: ParserError<Stream<'i>> + AddContext<Stream<'i>, StrContext>>(
    input: &mut Stream<'i>,
) -> ModalResult<String, E> {
    preceded(
        '\"',
        // `cut_err` transforms an `ErrMode::Backtrack(e)` to `ErrMode::Cut(e)`, signaling to
        // combinators like `alt` that they should not try other parsers. We were in the
        // right branch (since we found the `"` character) but encountered an error when
        // parsing the string
        cut_err(terminated(
            // `fold` accumulates each parsed character directly into a `String`
            // without building an intermediate collection.
            repeat(0.., character).fold(String::new, |mut string, c| {
                string.push(c);
                string
            }),
            '\"',
        )),
    )
    // `context` lets you add a static string to errors to provide more information in the
    // error chain (to indicate which parser had an error)
    .context(StrContext::Expected("string".into()))
    .parse_next(input)
}
/// You can mix the above declarative parsing with an imperative style to handle more unique cases,
/// like escaping
fn character<'i, E: ParserError<Stream<'i>>>(input: &mut Stream<'i>) -> ModalResult<char, E> {
    // Any character except the closing quote; a backslash starts an escape sequence.
    let c = none_of('"').parse_next(input)?;
    if c == '\\' {
        alt((
            // Single-character escapes map directly to their control characters
            any.verify_map(|c| {
                Some(match c {
                    '"' | '\\' | '/' => c,
                    'b' => '\x08',
                    'f' => '\x0C',
                    'n' => '\n',
                    'r' => '\r',
                    't' => '\t',
                    _ => return None,
                })
            }),
            // `\uXXXX` escapes (including surrogate pairs) are handled separately
            preceded('u', unicode_escape),
        ))
        .parse_next(input)
    } else {
        Ok(c)
    }
}
/// Decode a `\uXXXX` escape (the leading `\u` is already consumed), handling
/// UTF-16 surrogate pairs for code points above U+FFFF.
fn unicode_escape<'i, E: ParserError<Stream<'i>>>(input: &mut Stream<'i>) -> ModalResult<char, E> {
    alt((
        // Not a surrogate
        u16_hex
            .verify(|cp| !(0xD800..0xE000).contains(cp))
            .map(|cp| cp as u32),
        // See https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF for details
        separated_pair(u16_hex, "\\u", u16_hex)
            // High surrogate must come first, followed by a low surrogate
            .verify(|(high, low)| (0xD800..0xDC00).contains(high) && (0xDC00..0xE000).contains(low))
            .map(|(high, low)| {
                let high_ten = (high as u32) - 0xD800;
                let low_ten = (low as u32) - 0xDC00;
                (high_ten << 10) + low_ten + 0x10000
            }),
    ))
    .verify_map(
        // Could be probably replaced with .unwrap() or _unchecked due to the verify checks
        std::char::from_u32,
    )
    .parse_next(input)
}
/// Parse exactly four hexadecimal digits into a `u16` UTF-16 code unit.
fn u16_hex<'i, E: ParserError<Stream<'i>>>(input: &mut Stream<'i>) -> ModalResult<u16, E> {
    // Grab four characters, then reject the token unless it parses as hex.
    let four_chars = take(4usize);
    four_chars
        .verify_map(|hex| u16::from_str_radix(hex, 16).ok())
        .parse_next(input)
}
/// Some combinators, like `separated` or `repeat`, will call a parser repeatedly,
/// accumulating results in a `Vec`, until it encounters an error.
/// If you want more control on the parser application, check out the `iterator`
/// combinator (cf `examples/iterator.rs`)
fn array<'i, E: ParserError<Stream<'i>> + AddContext<Stream<'i>, StrContext>>(
    input: &mut Stream<'i>,
) -> ModalResult<Vec<JsonValue>, E> {
    // Once `[` is seen we are committed to an array, so failures past that
    // point are hard errors (`cut_err`) instead of backtracking into `alt`.
    let elements = separated(0.., json_value, (ws, ',', ws));
    let close = (ws, ']');
    preceded(('[', ws), cut_err(terminated(elements, close)))
        .context(StrContext::Expected("array".into()))
        .parse_next(input)
}
/// Parse a JSON object, `{ "key": value, ... }`, into a `HashMap`.
fn object<'i, E: ParserError<Stream<'i>> + AddContext<Stream<'i>, StrContext>>(
    input: &mut Stream<'i>,
) -> ModalResult<HashMap<String, JsonValue>, E> {
    // Once `{` is seen we are committed to an object, so failures past that
    // point are hard errors (`cut_err`) instead of backtracking into `alt`.
    let members = separated(0.., key_value, (ws, ',', ws));
    let close = (ws, '}');
    preceded(('{', ws), cut_err(terminated(members, close)))
        .context(StrContext::Expected("object".into()))
        .parse_next(input)
}
/// Parse a single `"key": value` member of an object.
fn key_value<'i, E: ParserError<Stream<'i>> + AddContext<Stream<'i>, StrContext>>(
    input: &mut Stream<'i>,
) -> ModalResult<(String, JsonValue), E> {
    // After a valid key, a missing `:` is a hard error (`cut_err`) rather
    // than something to backtrack over.
    let colon = cut_err((ws, ':', ws));
    separated_pair(string, colon, json_value).parse_next(input)
}
/// Parser combinators are constructed from the bottom up:
/// first we write parsers for the smallest elements (here a space character),
/// then we'll combine them in larger parsers
fn ws<'i, E: ParserError<Stream<'i>>>(input: &mut Stream<'i>) -> ModalResult<&'i str, E> {
    // Combinators like `take_while` return a function. That function is the
    // parser, to which we can pass the input
    take_while(0.., WS).parse_next(input)
}

// Note: excludes `\n`/`\r` — `ndjson` uses `line_ending` to delimit records.
const WS: &[char] = &[' ', '\t'];
#[cfg(test)]
mod test {
    #[allow(clippy::useless_attribute)]
    #[allow(unused_imports)] // its dead for benches
    use super::*;

    #[allow(clippy::useless_attribute)]
    #[allow(dead_code)] // its dead for benches
    type Error = winnow::error::ContextError;

    #[test]
    fn json_string() {
        // Empty and plain strings round-trip
        assert_eq!(
            string::<Error>.parse_peek(Partial::new("\"\"")),
            Ok((Partial::new(""), "".to_owned()))
        );
        assert_eq!(
            string::<Error>.parse_peek(Partial::new("\"abc\"")),
            Ok((Partial::new(""), "abc".to_owned()))
        );
        // Every simple escape plus `\u` escapes and a literal multi-byte char
        assert_eq!(
            string::<Error>.parse_peek(Partial::new(
                "\"abc\\\"\\\\\\/\\b\\f\\n\\r\\t\\u0001\\u2014\u{2014}def\""
            )),
            Ok((
                Partial::new(""),
                "abc\"\\/\x08\x0C\n\r\t\x01——def".to_owned()
            )),
        );
        // A valid surrogate pair decodes to a single char
        assert_eq!(
            string::<Error>.parse_peek(Partial::new("\"\\uD83D\\uDE10\"")),
            Ok((Partial::new(""), "😐".to_owned()))
        );
        // Unterminated strings and malformed escapes must fail
        assert!(string::<Error>.parse_peek(Partial::new("\"")).is_err());
        assert!(string::<Error>.parse_peek(Partial::new("\"abc")).is_err());
        assert!(string::<Error>.parse_peek(Partial::new("\"\\\"")).is_err());
        assert!(string::<Error>
            .parse_peek(Partial::new("\"\\u123\""))
            .is_err());
        // Lone or mismatched surrogates are rejected
        assert!(string::<Error>
            .parse_peek(Partial::new("\"\\uD800\""))
            .is_err());
        assert!(string::<Error>
            .parse_peek(Partial::new("\"\\uD800\\uD800\""))
            .is_err());
        assert!(string::<Error>
            .parse_peek(Partial::new("\"\\uDC00\""))
            .is_err());
    }

    #[test]
    fn json_object() {
        use JsonValue::{Num, Object, Str};

        // Trailing newline is required: `ndjson` records are line-terminated
        let input = r#"{"a":42,"b":"x"}
"#;

        let expected = Object(
            vec![
                ("a".to_owned(), Num(42.0)),
                ("b".to_owned(), Str("x".to_owned())),
            ]
            .into_iter()
            .collect(),
        );

        assert_eq!(
            ndjson::<Error>.parse_peek(Partial::new(input)),
            Ok((Partial::new(""), Some(expected)))
        );
    }

    #[test]
    fn json_array() {
        use JsonValue::{Array, Num, Str};

        let input = r#"[42,"x"]
"#;
        let expected = Array(vec![Num(42.0), Str("x".to_owned())]);

        assert_eq!(
            ndjson::<Error>.parse_peek(Partial::new(input)),
            Ok((Partial::new(""), Some(expected)))
        );
    }

    #[test]
    fn json_whitespace() {
        use JsonValue::{Array, Boolean, Null, Num, Object, Str};

        // Exercise every value type with aggressive inter-token whitespace
        let input = r#" { "null" : null, "true" :true , "false": false , "number" : 123e4 , "string" : " abc 123 " , "array" : [ false , 1 , "two" ] , "object" : { "a" : 1.0 , "b" : "c" } , "empty_array" : [ ] , "empty_object" : { } }
"#;

        assert_eq!(
            ndjson::<Error>.parse_peek(Partial::new(input)),
            Ok((
                Partial::new(""),
                Some(Object(
                    vec![
                        ("null".to_owned(), Null),
                        ("true".to_owned(), Boolean(true)),
                        ("false".to_owned(), Boolean(false)),
                        ("number".to_owned(), Num(123e4)),
                        ("string".to_owned(), Str(" abc 123 ".to_owned())),
                        (
                            "array".to_owned(),
                            Array(vec![Boolean(false), Num(1.0), Str("two".to_owned())])
                        ),
                        (
                            "object".to_owned(),
                            Object(
                                vec![
                                    ("a".to_owned(), Num(1.0)),
                                    ("b".to_owned(), Str("c".to_owned())),
                                ]
                                .into_iter()
                                .collect()
                            )
                        ),
                        ("empty_array".to_owned(), Array(vec![]),),
                        ("empty_object".to_owned(), Object(HashMap::new()),),
                    ]
                    .into_iter()
                    .collect()
                ))
            ))
        );
    }
}