extern crate nom;
use nom::error::ErrorKind;
use std::str::from_utf8_unchecked;
use nom::branch::alt;
use nom::bytes::complete::{is_not, tag};
use nom::character::complete::{digit1, multispace0};
use nom::combinator::{map, opt};
use nom::multi::many0;
use nom::number::complete::double;
use nom::sequence::{delimited, preceded, terminated, tuple};
use nom::IResult;
use crate::dataframe::Data;
use crate::schema::DataType;
#[inline(always)]
fn left_angle_bracket(i: &[u8]) -> IResult<&[u8], &[u8]> {
terminated(tag("<"), multispace0)(i)
}
#[inline(always)]
fn right_angle_bracket(i: &[u8]) -> IResult<&[u8], &[u8]> {
preceded(multispace0, tag(">"))(i)
}
#[inline(always)]
fn parse_bool(i: &[u8]) -> IResult<&[u8], Data> {
let (remaining_input, b) = alt((tag("1"), tag("0")))(i)?;
match b {
b"1" => Ok((remaining_input, Data::Bool(true))),
b"0" => Ok((remaining_input, Data::Bool(false))),
_ => unreachable!(),
}
}
#[inline(always)]
fn parse_delimited_bool(i: &[u8]) -> IResult<&[u8], Data> {
delimited(left_angle_bracket, parse_bool, right_angle_bracket)(i)
}
/// Parses an optionally signed decimal integer into `Data::Int`.
///
/// Accepts an optional leading `+` or `-` followed by one or more ASCII
/// digits. The sign and the digits are converted together, so the whole
/// `i64` range is accepted, including `i64::MIN`, whose magnitude on its
/// own does not fit in an `i64`.
///
/// # Errors
/// Propagates the combinator error when the input does not start with an
/// (optionally signed) run of digits, and returns `ErrorKind::Digit`
/// anchored at the original input when the value does not fit in an `i64`.
#[inline(always)]
fn parse_int(i: &[u8]) -> IResult<&[u8], Data> {
    let (remaining_input, _) =
        tuple((opt(alt((tag("+"), tag("-")))), digit1))(i)?;
    // Everything the parser consumed: the optional sign plus the digits.
    let literal = &i[..i.len() - remaining_input.len()];
    // SAFETY: `literal` is exactly what the parser above matched, i.e. an
    // optional ASCII `+`/`-` followed by ASCII digits, which is valid UTF-8.
    let text = unsafe { from_utf8_unchecked(literal) };
    // `i64::from_str` understands a leading `+` or `-` itself.
    match text.parse::<i64>() {
        Ok(n) => Ok((remaining_input, Data::Int(n))),
        Err(_) => Err(nom::Err::Error((i, ErrorKind::Digit))),
    }
}
#[inline(always)]
fn parse_delimited_int(i: &[u8]) -> IResult<&[u8], Data> {
delimited(left_angle_bracket, parse_int, right_angle_bracket)(i)
}
/// Parses a string value into `Data::String`.
///
/// Two forms are tried in order:
/// 1. a double-quoted string (`"hello world"`), whose content may contain
///    anything except a double quote;
/// 2. a bare token, which runs up to (not including) the first space or `>`.
///
/// The quoted form needs at least one byte between the quotes, so an empty
/// quoted string `""` falls through to the bare form and arrives as the two
/// quote bytes; that exact token is mapped to the empty string.
///
/// The input is arbitrary bytes, so it is decoded with
/// `String::from_utf8_lossy`: valid UTF-8 is copied unchanged and invalid
/// sequences become U+FFFD instead of producing an invalid `String`.
#[inline(always)]
fn parse_string(i: &[u8]) -> IResult<&[u8], Data> {
    map(
        alt((delimited(tag("\""), is_not("\""), tag("\"")), is_not(" >"))),
        |s: &[u8]| {
            Data::String(match s {
                // Empty quoted string matched by the bare-token branch.
                b"\"\"" => String::new(),
                _ => String::from_utf8_lossy(s).into_owned(),
            })
        },
    )(i)
}
#[inline(always)]
fn parse_delimited_string(i: &[u8]) -> IResult<&[u8], Data> {
delimited(left_angle_bracket, parse_string, right_angle_bracket)(i)
}
#[inline(always)]
fn parse_float(i: &[u8]) -> IResult<&[u8], Data> {
map(double, Data::Float)(i)
}
#[inline(always)]
fn parse_delimited_float(i: &[u8]) -> IResult<&[u8], Data> {
delimited(left_angle_bracket, parse_float, right_angle_bracket)(i)
}
#[inline(always)]
fn parse_null(i: &[u8]) -> IResult<&[u8], Data> {
map(multispace0, |_| Data::Null)(i)
}
#[inline(always)]
fn parse_delimited_null(i: &[u8]) -> IResult<&[u8], Data> {
delimited(left_angle_bracket, parse_null, right_angle_bracket)(i)
}
/// Parses one `<...>` field, inferring its type from its content.
///
/// The alternatives are tried in a fixed order and the first one that
/// matches the whole field wins: null, bool, int, float, string. The order
/// is what makes `<1>` a bool rather than an int, `<123>` an int rather
/// than a float, and leaves string as the catch-all.
fn parse_field(i: &[u8]) -> IResult<&[u8], Data> {
    let any_field = alt((
        parse_delimited_null,
        parse_delimited_bool,
        parse_delimited_int,
        parse_delimited_float,
        parse_delimited_string,
    ));
    any_field(i)
}
/// Parses a whole line of `<...>` fields, inferring the type of each field.
///
/// Fields may be surrounded and separated by whitespace.
///
/// Returns `Some` with one `Data` per field when the entire input was
/// consumed (an empty input yields an empty vector), and `None` when any
/// input is left over after the last field that parsed, or when the
/// underlying parser reports an error.
pub fn parse_line(i: &[u8]) -> Option<Vec<Data>> {
    // `many0` stops at the first field that fails to parse and hands back
    // the unconsumed tail; a parser error is turned into `None` rather than
    // a panic.
    let (remaining_input, data) =
        many0(delimited(multispace0, parse_field, multispace0))(i).ok()?;
    if remaining_input.is_empty() {
        Some(data)
    } else {
        None
    }
}
/// Consumes zero or more leading whitespace bytes by delegating to nom's
/// `multispace0`.
///
/// The concrete signature fixes the input type to `&[u8]` and the error
/// type to the `IResult` default; presumably this exists so call sites can
/// use the result without type annotations.
fn my_multispace(i: &[u8]) -> IResult<&[u8], &[u8]> {
multispace0(i)
}
/// Parses a line of `<...>` fields, using `schema` to decide the type of
/// each column.
///
/// For every column in `schema`, in order:
/// * leading whitespace is skipped;
/// * if the input is exhausted, the column is recorded as `Data::Null`;
/// * otherwise an empty field (`<>`) is recorded as `Data::Null`;
/// * otherwise the field is parsed with the parser for the column's type.
///
/// Returns `None` when `i` is empty or when a field does not parse as its
/// column type. Input left over after the last schema column is ignored.
pub fn parse_line_with_schema(
    i: &[u8],
    schema: &Vec<DataType>,
) -> Option<Vec<Data>> {
    if i.is_empty() {
        return None;
    }
    let mut fields: Vec<Data> = Vec::with_capacity(schema.len() + 1);
    let mut cursor = i;
    for column_type in schema {
        cursor = my_multispace(cursor).unwrap().0;
        if cursor.is_empty() {
            // Ran out of input: the remaining columns are implicitly missing.
            fields.push(Data::Null);
            continue;
        }
        // An explicit empty field takes precedence over the column type.
        let parsed = parse_delimited_null(cursor).or_else(|_| match column_type {
            DataType::String => parse_delimited_string(cursor),
            DataType::Float => parse_delimited_float(cursor),
            DataType::Int => parse_delimited_int(cursor),
            DataType::Bool => parse_delimited_bool(cursor),
        });
        let (rest, value) = parsed.ok()?;
        fields.push(value);
        cursor = rest;
    }
    Some(fields)
}
// Unit tests for the field parsers and the two line-level entry points.
#[cfg(test)]
mod tests {
use super::*;
// Quoted strings keep inner spaces, `""` is the empty string, and a bare
// token stops at the first space.
#[test]
fn test_parse_string() {
let x = parse_string(b"\"hello world\"");
assert_eq!(x.unwrap().1, Data::String("hello world".to_string()));
let x = parse_string(b"\" \"");
assert_eq!(x.unwrap().1, Data::String(" ".to_string()));
let x = parse_string(b"\"\"");
assert_eq!(x.unwrap().1, Data::String("".to_string()));
let x = parse_string(b"hello");
assert_eq!(x.unwrap().1, Data::String("hello".to_string()));
let x = parse_string(b"hello world");
assert_eq!(x.unwrap().1, Data::String("hello".to_string()));
}
// `1` is true and `0` is false.
#[test]
fn test_parse_bool() {
let x = parse_bool(b"1");
assert_eq!(x.unwrap().1, Data::Bool(true));
let y = parse_bool(b"0");
assert_eq!(y.unwrap().1, Data::Bool(false));
}
// Explicit `+`/`-` signs, no sign, and a leading zero are all accepted.
#[test]
fn test_parse_int() {
let x = parse_int(b"+123");
assert_eq!(x.unwrap().1, Data::Int(123));
let y = parse_int(b"-123");
assert_eq!(y.unwrap().1, Data::Int(-123));
let z = parse_int(b"123");
assert_eq!(z.unwrap().1, Data::Int(123));
let w = parse_int(b"01");
assert_eq!(w.unwrap().1, Data::Int(1));
}
// Plain decimals and exponent notation (with signed exponents) are accepted.
#[test]
fn test_parse_float() {
let x = parse_float(b"69E-01");
assert_eq!(x.unwrap().1, Data::Float(6.9));
let y = parse_float(b"-2.2");
assert_eq!(y.unwrap().1, Data::Float(-2.2));
let z = parse_float(b"2.2");
assert_eq!(z.unwrap().1, Data::Float(2.2));
let z = parse_float(b"4.20E+2");
assert_eq!(z.unwrap().1, Data::Float(420.0));
}
// Type inference for a single delimited field: string, int, float, bool,
// and the two spellings of a null field.
#[test]
fn test_parse_field() {
let s = parse_field(b"< hello >");
assert_eq!(s.unwrap().1, Data::String("hello".to_string()));
let i = parse_field(b"<123>");
assert_eq!(i.unwrap().1, Data::Int(123));
let f = parse_field(b"< 123.123 >");
assert_eq!(f.unwrap().1, Data::Float(123.123));
let b = parse_field(b"< 1 >");
assert_eq!(b.unwrap().1, Data::Bool(true));
let n = parse_field(b"< >");
assert_eq!(n.unwrap().1, Data::Null);
let n2 = parse_field(b"<>");
assert_eq!(n2.unwrap().1, Data::Null);
}
// Schema-less line parsing: mixed field types, the empty line, and lines
// that must be rejected because a field is malformed.
#[test]
fn test_parse_line() {
let line = parse_line(b"< hello > <123> <123.123> <> <1>");
assert_eq!(
line,
Some(vec![
Data::String("hello".to_string()),
Data::Int(123),
Data::Float(123.123),
Data::Null,
Data::Bool(true)
])
);
// NOTE(review): as shown here this case is textually identical to the one
// above; it may have been meant to differ in whitespace. Confirm.
let line = parse_line(b"< hello > <123> <123.123> <> <1>");
assert_eq!(
line,
Some(vec![
Data::String("hello".to_string()),
Data::Int(123),
Data::Float(123.123),
Data::Null,
Data::Bool(true)
])
);
// An empty line is valid and has no fields.
let empty = parse_line(b"");
assert_eq!(empty, Some(vec![]));
let i = parse_line(b"<123>");
assert_eq!(i, Some(vec![Data::Int(123)]));
// A space inside a number or an unquoted string makes the field invalid.
let failing = parse_line(b"<1. 0>");
assert_eq!(failing, None);
let failing2 = parse_line(b"<bye world>");
assert_eq!(failing2, None);
let failing3 = parse_line(b"<+ 1>");
assert_eq!(failing3, None);
}
// Schema-driven parsing: the column type, not the content, decides how a
// field is interpreted.
#[test]
fn test_parse_line_with_schema() {
let schema = vec![
DataType::String,
DataType::Int,
DataType::Float,
DataType::String,
DataType::Bool,
];
// Leading and trailing whitespace around the line is tolerated, and an
// empty field in a String column is Null.
let line = parse_line_with_schema(
b" < hello > <123> <123.123> <> <1> ",
&schema,
);
assert_eq!(
line,
Some(vec![
Data::String("hello".to_string()),
Data::Int(123),
Data::Float(123.123),
Data::Null,
Data::Bool(true)
])
);
// Quoted strings may contain spaces, which are preserved.
let string_variants = parse_line_with_schema(
b"< \"hi world\" > <+2> <1.1> <\" hi \"> <0> ",
&schema,
);
assert_eq!(
string_variants,
Some(vec![
Data::String("hi world".to_string()),
Data::Int(2),
Data::Float(1.1),
Data::String(" hi ".to_string()),
Data::Bool(false)
])
);
// Angle brackets can appear inside string values.
let string_variants2 = parse_line_with_schema(
b"< \"<>\" > <-2> <1.19999> <<> <0> ",
&schema,
);
assert_eq!(
string_variants2,
Some(vec![
Data::String("<>".to_string()),
Data::Int(-2),
Data::Float(1.19999),
Data::String("<".to_string()),
Data::Bool(false)
])
);
// The same `<1>` text becomes a String, Int or Bool depending on the column.
let parse_schema_precedence =
parse_line_with_schema(b"<1> <1> <1.0> <1> <1>", &schema);
assert_eq!(
parse_schema_precedence,
Some(vec![
Data::String("1".to_string()),
Data::Int(1),
Data::Float(1.0),
Data::String("1".to_string()),
Data::Bool(true),
])
);
}
// Missing values: explicit `<>` fields, columns absent at the end of the
// line, and extra fields beyond the schema.
#[test]
fn test_parse_line_with_schema_and_missing_fields() {
let schema = vec![DataType::String, DataType::Int, DataType::Float];
let parse_explicit_missing =
parse_line_with_schema(b"<> <-1> <>", &schema);
assert_eq!(
parse_explicit_missing,
Some(vec![Data::Null, Data::Int(-1), Data::Null,])
);
// Columns with no corresponding field at the end of the line become Null.
let implicit_missing_at_end =
parse_line_with_schema(b"<bye> <223> ", &schema);
assert_eq!(
implicit_missing_at_end,
Some(vec![
Data::String("bye".to_string()),
Data::Int(223),
Data::Null,
])
);
// Fields beyond the schema length are ignored rather than rejected.
let too_many_fields_not_discarded = parse_line_with_schema(
b"<bye> <223> <1.123> <> <1> <extra_field>",
&schema,
);
assert_eq!(
too_many_fields_not_discarded,
Some(vec![
Data::String("bye".to_string()),
Data::Int(223),
Data::Float(1.123),
])
);
}
// Lines that must be rejected when parsed against a schema.
#[test]
fn test_parsing_bad_lines_with_schema() {
let schema = vec![
DataType::String,
DataType::Int,
DataType::Float,
DataType::String,
DataType::Bool,
];
// An unquoted string containing a space is invalid.
let bad_string = parse_line_with_schema(
b"< hi world > <+2> <1.1> <\" hi \"> <0> ",
&schema,
);
assert_eq!(bad_string, None);
// A float in an Int column fails the whole row.
let bad_row_wrong_types =
parse_line_with_schema(b"<world> <1.2> <123> <1> <0>", &schema);
assert_eq!(bad_row_wrong_types, None);
// Unlike `parse_line`, an empty line is rejected here.
let empty = parse_line_with_schema(b"", &schema);
assert_eq!(empty, None);
}
}