//! c0sv 0.2.0
//!
//! Binary CSV, using C0 ASCII control codes
//!
//! Documentation
use alloc::borrow::Cow;
use alloc::vec::Vec;
use core::slice;
use nom::bytes::streaming::{tag, take_while, take};
use nom::combinator::{peek, opt};
use nom::branch::alt;
use nom::IResult;

/** SOH (0x01): the byte `parse_heading` requires at the very start of a heading. */
pub const START_OF_HEADING: u8 = 0x01;
/** STX (0x02): the byte `parse_records` requires at the start of the record body; when it follows
 * a run of units, `parse_units` reports `EndType::EndOfHeader`.
 */
pub const START_OF_TEXT: u8 = 0x02;
/** ETX (0x03): ends the record body; `parse_units` reports `EndType::EndOfDocument` for it. */
pub const END_OF_TEXT: u8 = 0x03;
/** ESC (0x1B): `parse_unit` drops this byte and takes the byte after it literally. */
pub const ESCAPE: u8 = 0x1B;
/** RS (0x1E): ends a record; `parse_units` reports `EndType::EndOfRecord` for it. */
pub const RECORD_SEPARATOR: u8 = 0x1E;
/** US (0x1F): separates the units inside a heading or record. */
pub const UNIT_SEPARATOR: u8 = 0x1F;

/** Which control character terminated a run of units parsed by `parse_units`.
 *
 * The type is a plain tag with no payload, so it is `Copy` and comparable; callers can write
 * `units.end_type == EndType::EndOfRecord` instead of pattern matching.
 */
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EndType {
    /** The run was followed by `START_OF_TEXT`. */
    EndOfHeader,
    /** The run was followed by `RECORD_SEPARATOR`. */
    EndOfRecord,
    /** The run was followed by `END_OF_TEXT`. */
    EndOfDocument,
}

/** A single field of a heading or record, as raw bytes.
 *
 * `parse_unit` yields a borrowed slice of the input when the unit contains no escapes and an
 * owned buffer (with the escape bytes removed) otherwise.  Equality compares the byte content,
 * so a borrowed and an owned unit holding the same bytes are equal.
 */
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Unit<'a>(pub Cow<'a, [u8]>);

/** A run of units plus the kind of terminator that ended it.
 *
 * This is the value produced by `parse_units`; it is used both for headings (wrapped in
 * `Heading`) and for individual records.
 */
#[derive(Debug)]
pub struct Units<'a>{
    /** Which control character followed the last unit of the run. */
    pub end_type: EndType,
    /** The parsed units, in the order they appeared in the input. */
    pub units: Vec<Unit<'a>>,
}

/** The units of a heading, as produced by `parse_heading`. */
#[derive(Debug)]
pub struct Heading<'a>(pub Units<'a>);

/** A fully parsed document, as produced by `parse_document`. */
#[derive(Debug)]
pub struct Document<'a>{
    /** The heading, or `None` when `parse_document` did not find one before the records. */
    pub heading: Option<Heading<'a>>,
    /** Every record of the body, in input order. */
    pub records: Vec<Units<'a>>,
}

pub fn is_control(input: u8) -> bool {
    matches!(
        input,
        START_OF_HEADING | START_OF_TEXT | END_OF_TEXT | ESCAPE | RECORD_SEPARATOR | UNIT_SEPARATOR
    )
}

/** Parse input into a unit, which may or may not be owned.
 * 
 * Will process escapes, but any and all other control characters are left unprocessed, expected to
 * be handled at another stage.  When successful, will always leave one non-escape control
 * character in the stream.
 */
pub fn parse_unit(input: &[u8]) -> IResult<&[u8], Unit<'_>> {
    let (input, unit) = take_while(|byte| !is_control(byte))(input)?;
    let (input, mut control) = peek(take(1u8))(input)?;
    if control[0] != ESCAPE {
        return Ok((input, Unit(unit.into())));
    }
    let mut unit: Vec<u8> = Vec::from(unit);
    let mut input = input;
    while control[0] == ESCAPE {
        // We peeked, so we know that input has at least 1 character at this point.
        input = &input[1..];
        let ret = take(1u8)(input)?;
        input = ret.0;
        unit.push(ret.1[0]);

        let ret = take_while(|byte| !is_control(byte))(input)?;
        input = ret.0;
        unit.extend(ret.1);

        control = peek(take(1u8))(input)?.1;
    }
    return Ok((input, Unit(unit.into())));
}

/** Parse input into a set of units, each of which may or may not be owned.
 * 
 * Will process escapes and unit separators, but any and all other control characters are left
 * unprocessed, expected to be handled at another stage.
 *
 * When successful, will always leave one non-escape control character in the stream.
 */
pub fn parse_units(mut input: &[u8]) -> IResult<&[u8], Units<'_>> {
    let mut output = Vec::new();
    loop {
        let (inner_input, unit) = parse_unit(input)?;
        output.push(unit);
        match inner_input[0] {
            RECORD_SEPARATOR => return Ok((inner_input, Units {
                units: output,
                end_type: EndType::EndOfRecord,
            })),
            START_OF_TEXT => return Ok((inner_input, Units {
                units: output,
                end_type: EndType::EndOfHeader,
            })),
            END_OF_TEXT => return Ok((inner_input, Units {
                units: output,
                end_type: EndType::EndOfDocument,
            })),
            _ => (),
        }
        // Strip out the UNIT_SEPARATOR
        input = &inner_input[1..];
    }
}

/** Parse a heading: a `START_OF_HEADING` byte followed by a run of units.
 *
 * The leading `START_OF_HEADING` is consumed; the control character that ends the run of units
 * is left in the returned input, exactly as `parse_units` leaves it.
 */
pub fn parse_heading(input: &[u8]) -> IResult<&[u8], Heading<'_>> {
    let (after_marker, _) = tag(slice::from_ref(&START_OF_HEADING))(input)?;
    parse_units(after_marker).map(|(rest, units)| (rest, Heading(units)))
}

/** Parse the record body of a document.
 *
 * Used in service of parse_document.  The body must start with `START_OF_TEXT`; records are then
 * read one after another, each terminated by `RECORD_SEPARATOR`, until a record is terminated by
 * `END_OF_TEXT`.  Both the opening `START_OF_TEXT` and the closing `END_OF_TEXT` are consumed.
 * A heading is not parsed here.
 */
pub fn parse_records(input: &[u8]) -> IResult<&[u8], Vec<Units<'_>>> {
    let (body, _) = tag(slice::from_ref(&START_OF_TEXT))(input)?;
    let mut remaining = body;
    let mut collected = Vec::new();
    loop {
        let (after_units, record) = parse_units(remaining)?;
        collected.push(record);
        // Only a record separator or the end of the text may follow a record.
        let (after_terminator, terminator) = alt((
            tag(slice::from_ref(&RECORD_SEPARATOR)),
            tag(slice::from_ref(&END_OF_TEXT)),
        ))(after_units)?;
        match terminator[0] {
            END_OF_TEXT => return Ok((after_terminator, collected)),
            _ => remaining = after_terminator,
        }
    }
}

/** Parse a whole document: an optional heading followed by the record body.
 *
 * Convenient when the entire document is already in memory, or is known to fit; otherwise stream
 * the heading and the records individually.
 */
pub fn parse_document(input: &[u8]) -> IResult<&[u8], Document<'_>> {
    let (after_heading, heading) = opt(parse_heading)(input)?;
    parse_records(after_heading).map(|(rest, records)| (rest, Document { heading, records }))
}

// Unit tests for the parsers above.
//
// Byte legend used in the fixtures: \x01 START_OF_HEADING, \x02 START_OF_TEXT, \x03 END_OF_TEXT,
// \x1B ESCAPE, \x1E RECORD_SEPARATOR, \x1F UNIT_SEPARATOR.
#[cfg(test)]
mod tests {
    use super::*;

    // A unit without escapes borrows from the input; a unit with an escape is owned and has the
    // ESCAPE byte removed.  In both cases the trailing control byte is left in the stream.
    #[test]
    fn unit() {
        let (input, parsed) = parse_unit(b"test_unit\x1F").unwrap();
        assert_eq!(input, b"\x1F");
        assert_eq!(parsed.0, &b"test_unit"[..]);
        assert!(matches!(parsed, Unit(Cow::Borrowed(_))), "borrowed when no escapes");

        // The escaped \x1F is kept as content rather than ending the unit.
        let (input, parsed) = parse_unit(b"test_unit\x1B\x1Frest_of_unit\x1F").unwrap();
        assert_eq!(input, b"\x1F");
        assert_eq!(parsed.0, &b"test_unit\x1Frest_of_unit"[..]);
        assert!(matches!(parsed, Unit(Cow::Owned(_))), "owned when escapes");
    }

    // Three units separated by \x1F and terminated by \x02, which stays in the stream.  The
    // third unit contains an escaped ESCAPE byte and is therefore owned.
    #[test]
    fn units() {
        let (input, parsed) = parse_units(b"test_unit\x1Fsecond_test_unit\x1Fowned\x1B\x1Btext\x02").unwrap();
        assert_eq!(input, b"\x02");
        assert_eq!(parsed.units.len(), 3);
        assert_eq!(parsed.units[0].0, &b"test_unit"[..]);
        assert!(matches!(parsed.units[0].0, Cow::Borrowed(_)));
        assert_eq!(parsed.units[1].0, &b"second_test_unit"[..]);
        assert!(matches!(parsed.units[1].0, Cow::Borrowed(_)));
        assert_eq!(parsed.units[2].0, &b"owned\x1Btext"[..]);
        assert!(matches!(parsed.units[2].0, Cow::Owned(_)));
    }

    // Same fixture as `units`, prefixed with \x01: the heading marker is consumed, the
    // terminating \x02 is not.
    #[test]
    fn heading() {
        let (input, parsed) = parse_heading(b"\x01test_unit\x1Fsecond_test_unit\x1Fowned\x1B\x1Btext\x02").unwrap();
        assert_eq!(input, b"\x02");
        assert_eq!(parsed.0.units.len(), 3);
        assert_eq!(parsed.0.units[0].0, &b"test_unit"[..]);
        assert!(matches!(parsed.0.units[0].0, Cow::Borrowed(_)));
        assert_eq!(parsed.0.units[1].0, &b"second_test_unit"[..]);
        assert!(matches!(parsed.0.units[1].0, Cow::Borrowed(_)));
        assert_eq!(parsed.0.units[2].0, &b"owned\x1Btext"[..]);
        assert!(matches!(parsed.0.units[2].0, Cow::Owned(_)));
    }

    // A body of two records: \x02 opens it, \x1E separates the records, \x03 closes it.  All of
    // the input is consumed.
    #[test]
    fn records() {
        let (input, parsed) = parse_records(b"\x02test_unit\x1Fsecond_test_unit\x1Fowned\x1B\x1Btext\x1Erecord_2_unit\x1F2_second_test_unit\x1Fowned\x1B\x1Btext_2\x03").unwrap();
        assert_eq!(input, b"");
        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0].units.len(), 3);
        assert_eq!(parsed[0].units[0].0, &b"test_unit"[..]);
        assert!(matches!(parsed[0].units[0].0, Cow::Borrowed(_)));
        assert_eq!(parsed[0].units[1].0, &b"second_test_unit"[..]);
        assert!(matches!(parsed[0].units[1].0, Cow::Borrowed(_)));
        assert_eq!(parsed[0].units[2].0, &b"owned\x1Btext"[..]);
        assert!(matches!(parsed[0].units[2].0, Cow::Owned(_)));
        assert_eq!(parsed[1].units.len(), 3);
        assert_eq!(parsed[1].units[0].0, &b"record_2_unit"[..]);
        assert!(matches!(parsed[1].units[0].0, Cow::Borrowed(_)));
        assert_eq!(parsed[1].units[1].0, &b"2_second_test_unit"[..]);
        assert!(matches!(parsed[1].units[1].0, Cow::Borrowed(_)));
        assert_eq!(parsed[1].units[2].0, &b"owned\x1Btext_2"[..]);
        assert!(matches!(parsed[1].units[2].0, Cow::Owned(_)));
    }

    // A document that starts directly with \x02 parses with `heading == None`.
    #[test]
    fn document_headingless() {
        let (input, parsed) = parse_document(b"\x02test_unit\x1Fsecond_test_unit\x1Fowned\x1B\x1Btext\x1Erecord_2_unit\x1F2_second_test_unit\x1Fowned\x1B\x1Btext_2\x03").unwrap();
        assert_eq!(input, b"");
        assert!(matches!(parsed.heading, None));
        assert_eq!(parsed.records.len(), 2);
        assert_eq!(parsed.records[0].units.len(), 3);
        assert_eq!(parsed.records[0].units[0].0, &b"test_unit"[..]);
        assert!(matches!(parsed.records[0].units[0].0, Cow::Borrowed(_)));
        assert_eq!(parsed.records[0].units[1].0, &b"second_test_unit"[..]);
        assert!(matches!(parsed.records[0].units[1].0, Cow::Borrowed(_)));
        assert_eq!(parsed.records[0].units[2].0, &b"owned\x1Btext"[..]);
        assert!(matches!(parsed.records[0].units[2].0, Cow::Owned(_)));
        assert_eq!(parsed.records[1].units.len(), 3);
        assert_eq!(parsed.records[1].units[0].0, &b"record_2_unit"[..]);
        assert!(matches!(parsed.records[1].units[0].0, Cow::Borrowed(_)));
        assert_eq!(parsed.records[1].units[1].0, &b"2_second_test_unit"[..]);
        assert!(matches!(parsed.records[1].units[1].0, Cow::Borrowed(_)));
        assert_eq!(parsed.records[1].units[2].0, &b"owned\x1Btext_2"[..]);
        assert!(matches!(parsed.records[1].units[2].0, Cow::Owned(_)));
    }

    // A full document: heading, then two records.  The second record ends in two empty units
    // (`\x1F\x1F\x03`), which must still be reported, as borrowed empty slices.
    #[test]
    fn document() {
        let (input, parsed) = parse_document(b"\x01alpha\x1Fbeta\x1Fgamma\x02test_unit\x1Fsecond_test_unit\x1Fowned\x1B\x1Btext\x1Erecord_2_unit\x1F\x1F\x03").unwrap();
        assert_eq!(input, b"");
        let heading = parsed.heading.unwrap();
        assert!(matches!(heading.0.units[0], Unit(Cow::Borrowed(b"alpha"))));
        assert!(matches!(heading.0.units[1], Unit(Cow::Borrowed(b"beta"))));
        assert!(matches!(heading.0.units[2], Unit(Cow::Borrowed(b"gamma"))));
        assert_eq!(parsed.records.len(), 2);
        assert_eq!(parsed.records[0].units.len(), 3);
        assert_eq!(parsed.records[0].units[0].0, &b"test_unit"[..]);
        assert!(matches!(parsed.records[0].units[0].0, Cow::Borrowed(_)));
        assert_eq!(parsed.records[0].units[1].0, &b"second_test_unit"[..]);
        assert!(matches!(parsed.records[0].units[1].0, Cow::Borrowed(_)));
        assert_eq!(parsed.records[0].units[2].0, &b"owned\x1Btext"[..]);
        assert!(matches!(parsed.records[0].units[2].0, Cow::Owned(_)));
        assert_eq!(parsed.records[1].units.len(), 3);
        assert_eq!(parsed.records[1].units[0].0, &b"record_2_unit"[..]);
        assert!(matches!(parsed.records[1].units[0].0, Cow::Borrowed(_)));
        assert_eq!(parsed.records[1].units[1].0, &b""[..]);
        assert!(matches!(parsed.records[1].units[1].0, Cow::Borrowed(_)));
        assert_eq!(parsed.records[1].units[2].0, &b""[..]);
        assert!(matches!(parsed.records[1].units[2].0, Cow::Borrowed(_)));
    }
}