Available on crate feature `unstable-doc` only.
§Parsing Partial Input
Typically, the input being parsed is all in-memory, or is complete. Some data sources are too
large to fit into memory, only allowing parsing an incomplete or Partial subset of the
data, requiring incremental parsing.
By wrapping a stream, like &[u8], with Partial, parsers will report when the data is
Incomplete and more input is Needed, allowing the caller to stream-in additional data
to be parsed. The data is then parsed a chunk at a time.
Chunks are typically defined by either:
- A header reporting the number of bytes, like with `length_and_then`. `Partial` can explicitly
  be changed to being complete once the specified bytes are acquired via
  `StreamIsPartial::complete`.
- A delimiter, like with ndjson
  - You can parse up-to the delimiter or do a
    `take_until(0.., delim).and_then(parser)`
If the chunks are not homogeneous, a state machine will be needed to track what the expected parser is for the next chunk.
Caveats:
- `winnow` takes the approach of re-parsing from scratch. Chunks should be relatively small to
  prevent the re-parsing overhead from dominating.
- Parsers like `repeat` do not know when an `eof` is from insufficient data or the end of the
  stream, causing them to always report `Incomplete`.
§Example
main.rs:
ⓘ
mod parser;
use std::io::Read;
use winnow::error::ContextError;
use winnow::error::ErrMode;
use winnow::error::Needed;
use winnow::prelude::*;
use winnow::stream::Offset;
/// Incrementally read newline-delimited JSON from the file given on the command line,
/// growing the read buffer whenever the parser reports `Incomplete`.
fn main() -> Result<(), lexopt::Error> {
    let args = Args::parse()?;
    let input = args.input.ok_or_else(|| lexopt::Error::MissingValue {
        option: Some("<PATH>".to_owned()),
    })?;
    let mut file = std::fs::File::open(input).map_err(to_lexopt)?;

    // Intentionally starting with a small buffer to make it easier to show `Incomplete` handling
    let buffer_size = 10;
    let min_buffer_growth = 100;
    let buffer_growth_factor = 2;
    let mut buffer = circular::Buffer::with_capacity(buffer_size);
    loop {
        // Top up the buffer with the next chunk from the file.
        let read = file.read(buffer.space()).map_err(to_lexopt)?;
        eprintln!("read {read}");
        if read == 0 {
            // Should be EOF since we always make sure there is `available_space`
            assert_ne!(buffer.available_space(), 0);
            // At EOF no partial record may remain, or the input was truncated.
            assert_eq!(
                buffer.available_data(),
                0,
                "leftover data: {}",
                String::from_utf8_lossy(buffer.data())
            );
            break;
        }
        buffer.fill(read);

        // Parse as many complete records as the buffered data allows before reading more.
        loop {
            let mut input =
                parser::Stream::new(std::str::from_utf8(buffer.data()).map_err(to_lexopt)?);
            // Checkpoint lets us measure how far the parser advanced on success.
            let start = input.checkpoint();
            match parser::ndjson::<ContextError>.parse_next(&mut input) {
                Ok(value) => {
                    println!("{value:?}");
                    println!();

                    // Tell the buffer how much we read
                    let consumed = input.offset_from(&start);
                    buffer.consume(consumed);
                }
                Err(ErrMode::Backtrack(e)) | Err(ErrMode::Cut(e)) => {
                    // A real parse error (not missing data): report and stop.
                    return Err(fmt_lexopt(e.to_string()));
                }
                Err(ErrMode::Incomplete(Needed::Size(size))) => {
                    // Without the format telling us how much space is required, we really should
                    // treat this the same as `Unknown` but are doing this to demonstrate how to
                    // handle `Size`.
                    //
                    // Even when the format has a header to tell us `Size`, we could hit incidental
                    // `Size(1)`s, so make sure we buffer more space than that to avoid reading
                    // one byte at a time
                    let head_room = size.get().max(min_buffer_growth);
                    let new_capacity = buffer.available_data() + head_room;
                    eprintln!("growing buffer to {new_capacity}");
                    buffer.grow(new_capacity);
                    if buffer.available_space() < head_room {
                        // `grow` may not free consumed space; compact the ring buffer.
                        eprintln!("buffer shift");
                        buffer.shift();
                    }
                    break;
                }
                Err(ErrMode::Incomplete(Needed::Unknown)) => {
                    // No size hint: grow geometrically and retry with more data.
                    let new_capacity = buffer_growth_factor * buffer.capacity();
                    eprintln!("growing buffer to {new_capacity}");
                    buffer.grow(new_capacity);
                    break;
                }
            }
        }
    }

    Ok(())
}
/// Command-line arguments for the example.
#[derive(Default)]
struct Args {
    /// Path to the newline-delimited JSON file to parse.
    input: Option<std::path::PathBuf>,
}
impl Args {
    /// Collect command-line arguments; the only accepted argument is a single
    /// positional `<PATH>`.
    fn parse() -> Result<Self, lexopt::Error> {
        use lexopt::prelude::*;

        let mut parsed = Args::default();
        let mut raw = lexopt::Parser::from_env();
        while let Some(arg) = raw.next()? {
            match arg {
                // The one positional argument is the input path.
                Value(path) => parsed.input = Some(path.into()),
                // Any flag or option is rejected.
                _ => return Err(arg.unexpected()),
            }
        }
        Ok(parsed)
    }
}
fn to_lexopt(e: impl std::error::Error + Send + Sync + 'static) -> lexopt::Error {
lexopt::Error::Custom(Box::new(e))
}
/// Wrap an already-formatted error message in a `lexopt::Error` for `main` to return.
fn fmt_lexopt(e: String) -> lexopt::Error {
    lexopt::Error::Custom(e.into())
}
parser.rs:
ⓘ
use std::collections::HashMap;
use std::str;
use winnow::prelude::*;
use winnow::{
ascii::float,
ascii::line_ending,
combinator::alt,
combinator::cut_err,
combinator::{delimited, preceded, separated_pair, terminated},
combinator::{repeat, separated},
error::{AddContext, ParserError, StrContext},
stream::Partial,
token::{any, none_of, take, take_while},
};
/// In-memory representation of a parsed JSON value.
#[derive(Debug, PartialEq, Clone)]
pub(crate) enum JsonValue {
    /// JSON `null`
    Null,
    /// JSON `true` / `false`
    Boolean(bool),
    /// JSON string, with escapes already resolved
    Str(String),
    /// JSON number; every number is parsed as `f64`
    Num(f64),
    /// JSON array of arbitrary values
    Array(Vec<JsonValue>),
    /// JSON object; key order is not preserved
    Object(HashMap<String, JsonValue>),
}
/// Use `Partial` to cause `ErrMode::Incomplete` while parsing
///
/// All parsers below operate on this stream type so they can report that more
/// input is `Needed` instead of failing at end of input.
pub(crate) type Stream<'i> = Partial<&'i str>;
/// Parse one newline-delimited JSON record: either a value terminated by a
/// line ending (`Some(value)`) or a blank line (`None`).
pub(crate) fn ndjson<'i, E: ParserError<Stream<'i>> + AddContext<Stream<'i>, StrContext>>(
    input: &mut Stream<'i>,
) -> ModalResult<Option<JsonValue>, E> {
    alt((
        // A value may be surrounded by non-newline whitespace
        terminated(delimited(ws, json_value, ws), line_ending).map(Some),
        // An empty line carries no value
        line_ending.value(None),
    ))
    .parse_next(input)
}
// --Besides `WS`, same as a regular json parser ----------------------------
/// `alt` is a combinator that tries multiple parsers one by one, until
/// one of them succeeds
fn json_value<'i, E: ParserError<Stream<'i>> + AddContext<Stream<'i>, StrContext>>(
    input: &mut Stream<'i>,
) -> ModalResult<JsonValue, E> {
    // `alt` combines each value parser. It returns the result of the first
    // successful parser, or an error
    alt((
        null.value(JsonValue::Null),
        boolean.map(JsonValue::Boolean),
        string.map(JsonValue::Str),
        float.map(JsonValue::Num),
        array.map(JsonValue::Array),
        object.map(JsonValue::Object),
    ))
    .parse_next(input)
}
/// `literal(string)` generates a parser that takes the argument string.
///
/// This also shows returning a sub-slice of the original input
fn null<'i, E: ParserError<Stream<'i>>>(input: &mut Stream<'i>) -> ModalResult<&'i str, E> {
    // Match the keyword and hand back the matched sub-slice of the input;
    // anything else is an error.
    let keyword = "null";
    keyword.parse_next(input)
}
/// We can combine `tag` with other functions, like `value` which returns a given constant value on
/// success.
fn boolean<'i, E: ParserError<Stream<'i>>>(input: &mut Stream<'i>) -> ModalResult<bool, E> {
    // Map the literal keywords straight to their `bool` values; `alt` tries
    // "true" first, then "false", and errors if neither matches.
    alt(("true".value(true), "false".value(false))).parse_next(input)
}
/// This parser gathers all `char`s up into a `String`, with parsers to take the double quote
/// character, before the string (using `preceded`) and after the string (using `terminated`).
fn string<'i, E: ParserError<Stream<'i>> + AddContext<Stream<'i>, StrContext>>(
    input: &mut Stream<'i>,
) -> ModalResult<String, E> {
    preceded(
        '\"',
        // `cut_err` transforms an `ErrMode::Backtrack(e)` to `ErrMode::Cut(e)`, signaling to
        // combinators like `alt` that they should not try other parsers. We were in the
        // right branch (since we found the `"` character) but encountered an error when
        // parsing the string
        cut_err(terminated(
            // `fold` accumulates each parsed character directly into a `String`
            // without building an intermediate collection.
            repeat(0.., character).fold(String::new, |mut string, c| {
                string.push(c);
                string
            }),
            '\"',
        )),
    )
    // `context` lets you add a static string to errors to provide more information in the
    // error chain (to indicate which parser had an error)
    .context(StrContext::Expected("string".into()))
    .parse_next(input)
}
/// You can mix the above declarative parsing with an imperative style to handle more unique cases,
/// like escaping
fn character<'i, E: ParserError<Stream<'i>>>(input: &mut Stream<'i>) -> ModalResult<char, E> {
    // Any character except the closing quote; a backslash starts an escape sequence.
    let c = none_of('"').parse_next(input)?;
    if c == '\\' {
        alt((
            // Single-character escapes map directly to their control characters
            any.verify_map(|c| {
                Some(match c {
                    '"' | '\\' | '/' => c,
                    'b' => '\x08',
                    'f' => '\x0C',
                    'n' => '\n',
                    'r' => '\r',
                    't' => '\t',
                    _ => return None,
                })
            }),
            // `\uXXXX` escapes (including surrogate pairs) are handled separately
            preceded('u', unicode_escape),
        ))
        .parse_next(input)
    } else {
        Ok(c)
    }
}
/// Decode a `\uXXXX` escape (the leading `\u` is already consumed), handling
/// UTF-16 surrogate pairs for code points above U+FFFF.
fn unicode_escape<'i, E: ParserError<Stream<'i>>>(input: &mut Stream<'i>) -> ModalResult<char, E> {
    alt((
        // Not a surrogate
        u16_hex
            .verify(|cp| !(0xD800..0xE000).contains(cp))
            .map(|cp| cp as u32),
        // See https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF for details
        separated_pair(u16_hex, "\\u", u16_hex)
            // High surrogate must come first, followed by a low surrogate
            .verify(|(high, low)| (0xD800..0xDC00).contains(high) && (0xDC00..0xE000).contains(low))
            .map(|(high, low)| {
                let high_ten = (high as u32) - 0xD800;
                let low_ten = (low as u32) - 0xDC00;
                (high_ten << 10) + low_ten + 0x10000
            }),
    ))
    .verify_map(
        // Could be probably replaced with .unwrap() or _unchecked due to the verify checks
        std::char::from_u32,
    )
    .parse_next(input)
}
/// Parse exactly four hexadecimal digits into a `u16` UTF-16 code unit.
fn u16_hex<'i, E: ParserError<Stream<'i>>>(input: &mut Stream<'i>) -> ModalResult<u16, E> {
    // Grab four characters, then reject the token unless it parses as hex.
    let four_chars = take(4usize);
    four_chars
        .verify_map(|hex| u16::from_str_radix(hex, 16).ok())
        .parse_next(input)
}
/// Some combinators, like `separated` or `repeat`, will call a parser repeatedly,
/// accumulating results in a `Vec`, until it encounters an error.
/// If you want more control on the parser application, check out the `iterator`
/// combinator (cf `examples/iterator.rs`)
fn array<'i, E: ParserError<Stream<'i>> + AddContext<Stream<'i>, StrContext>>(
    input: &mut Stream<'i>,
) -> ModalResult<Vec<JsonValue>, E> {
    // Once `[` is seen we are committed to an array, so failures past that
    // point are hard errors (`cut_err`) instead of backtracking into `alt`.
    let elements = separated(0.., json_value, (ws, ',', ws));
    let close = (ws, ']');
    preceded(('[', ws), cut_err(terminated(elements, close)))
        .context(StrContext::Expected("array".into()))
        .parse_next(input)
}
/// Parse a JSON object, `{ "key": value, ... }`, into a `HashMap`.
fn object<'i, E: ParserError<Stream<'i>> + AddContext<Stream<'i>, StrContext>>(
    input: &mut Stream<'i>,
) -> ModalResult<HashMap<String, JsonValue>, E> {
    // Once `{` is seen we are committed to an object, so failures past that
    // point are hard errors (`cut_err`) instead of backtracking into `alt`.
    let members = separated(0.., key_value, (ws, ',', ws));
    let close = (ws, '}');
    preceded(('{', ws), cut_err(terminated(members, close)))
        .context(StrContext::Expected("object".into()))
        .parse_next(input)
}
/// Parse a single `"key": value` member of an object.
fn key_value<'i, E: ParserError<Stream<'i>> + AddContext<Stream<'i>, StrContext>>(
    input: &mut Stream<'i>,
) -> ModalResult<(String, JsonValue), E> {
    // After a valid key, a missing `:` is a hard error (`cut_err`) rather
    // than something to backtrack over.
    let colon = cut_err((ws, ':', ws));
    separated_pair(string, colon, json_value).parse_next(input)
}
/// Parser combinators are constructed from the bottom up:
/// first we write parsers for the smallest elements (here a space character),
/// then we'll combine them in larger parsers
fn ws<'i, E: ParserError<Stream<'i>>>(input: &mut Stream<'i>) -> ModalResult<&'i str, E> {
    // Combinators like `take_while` return a function. That function is the
    // parser, to which we can pass the input
    take_while(0.., WS).parse_next(input)
}

// Note: excludes `\n`/`\r` — `ndjson` uses `line_ending` to delimit records.
const WS: &[char] = &[' ', '\t'];
#[cfg(test)]
mod test {
    #[allow(clippy::useless_attribute)]
    #[allow(unused_imports)] // its dead for benches
    use super::*;

    #[allow(clippy::useless_attribute)]
    #[allow(dead_code)] // its dead for benches
    type Error = winnow::error::ContextError;

    #[test]
    fn json_string() {
        // Empty and plain strings round-trip
        assert_eq!(
            string::<Error>.parse_peek(Partial::new("\"\"")),
            Ok((Partial::new(""), "".to_owned()))
        );
        assert_eq!(
            string::<Error>.parse_peek(Partial::new("\"abc\"")),
            Ok((Partial::new(""), "abc".to_owned()))
        );
        // Every simple escape plus `\u` escapes and a literal multi-byte char
        assert_eq!(
            string::<Error>.parse_peek(Partial::new(
                "\"abc\\\"\\\\\\/\\b\\f\\n\\r\\t\\u0001\\u2014\u{2014}def\""
            )),
            Ok((
                Partial::new(""),
                "abc\"\\/\x08\x0C\n\r\t\x01——def".to_owned()
            )),
        );
        // A valid surrogate pair decodes to a single char
        assert_eq!(
            string::<Error>.parse_peek(Partial::new("\"\\uD83D\\uDE10\"")),
            Ok((Partial::new(""), "😐".to_owned()))
        );
        // Unterminated strings and malformed escapes must fail
        assert!(string::<Error>.parse_peek(Partial::new("\"")).is_err());
        assert!(string::<Error>.parse_peek(Partial::new("\"abc")).is_err());
        assert!(string::<Error>.parse_peek(Partial::new("\"\\\"")).is_err());
        assert!(string::<Error>
            .parse_peek(Partial::new("\"\\u123\""))
            .is_err());
        // Lone or mismatched surrogates are rejected
        assert!(string::<Error>
            .parse_peek(Partial::new("\"\\uD800\""))
            .is_err());
        assert!(string::<Error>
            .parse_peek(Partial::new("\"\\uD800\\uD800\""))
            .is_err());
        assert!(string::<Error>
            .parse_peek(Partial::new("\"\\uDC00\""))
            .is_err());
    }

    #[test]
    fn json_object() {
        use JsonValue::{Num, Object, Str};

        // Trailing newline is required: `ndjson` records are line-terminated
        let input = r#"{"a":42,"b":"x"}
"#;

        let expected = Object(
            vec![
                ("a".to_owned(), Num(42.0)),
                ("b".to_owned(), Str("x".to_owned())),
            ]
            .into_iter()
            .collect(),
        );

        assert_eq!(
            ndjson::<Error>.parse_peek(Partial::new(input)),
            Ok((Partial::new(""), Some(expected)))
        );
    }

    #[test]
    fn json_array() {
        use JsonValue::{Array, Num, Str};

        let input = r#"[42,"x"]
"#;
        let expected = Array(vec![Num(42.0), Str("x".to_owned())]);

        assert_eq!(
            ndjson::<Error>.parse_peek(Partial::new(input)),
            Ok((Partial::new(""), Some(expected)))
        );
    }

    #[test]
    fn json_whitespace() {
        use JsonValue::{Array, Boolean, Null, Num, Object, Str};

        // Exercise every value type with aggressive inter-token whitespace
        let input = r#" { "null" : null, "true" :true , "false": false , "number" : 123e4 , "string" : " abc 123 " , "array" : [ false , 1 , "two" ] , "object" : { "a" : 1.0 , "b" : "c" } , "empty_array" : [ ] , "empty_object" : { } }
"#;

        assert_eq!(
            ndjson::<Error>.parse_peek(Partial::new(input)),
            Ok((
                Partial::new(""),
                Some(Object(
                    vec![
                        ("null".to_owned(), Null),
                        ("true".to_owned(), Boolean(true)),
                        ("false".to_owned(), Boolean(false)),
                        ("number".to_owned(), Num(123e4)),
                        ("string".to_owned(), Str(" abc 123 ".to_owned())),
                        (
                            "array".to_owned(),
                            Array(vec![Boolean(false), Num(1.0), Str("two".to_owned())])
                        ),
                        (
                            "object".to_owned(),
                            Object(
                                vec![
                                    ("a".to_owned(), Num(1.0)),
                                    ("b".to_owned(), Str("c".to_owned())),
                                ]
                                .into_iter()
                                .collect()
                            )
                        ),
                        ("empty_array".to_owned(), Array(vec![]),),
                        ("empty_object".to_owned(), Object(HashMap::new()),),
                    ]
                    .into_iter()
                    .collect()
                ))
            ))
        );
    }
}