gix-config 0.55.0

A git-config file parser and editor from the gitoxide project
Documentation
use std::borrow::Cow;

use bstr::{BStr, ByteSlice};
use winnow::{
    combinator::{alt, delimited, opt, preceded, repeat},
    error::{ErrMode, InputError as NomError, ParserError as _},
    prelude::*,
    stream::Offset as _,
    token::{one_of, take_till, take_while},
};

use crate::parse::{error::ParseNode, section, Comment, Error, Event};

/// Attempt to zero-copy parse the provided bytes, passing results to `dispatch`.
pub fn from_bytes<'i>(mut input: &'i [u8], dispatch: &mut dyn FnMut(Event<'i>)) -> Result<(), Error> {
    let start = input.checkpoint();

    let bom = unicode_bom::Bom::from(input);
    input.next_slice(bom.len());

    repeat(
        0..,
        alt((
            comment.map(Event::Comment),
            take_spaces1.map(|whitespace| Event::Whitespace(Cow::Borrowed(whitespace))),
            |i: &mut &'i [u8]| {
                let newline = take_newlines1.parse_next(i)?;
                let o = Event::Newline(Cow::Borrowed(newline));
                Ok(o)
            },
        )),
    )
    .fold(|| (), |_acc, event| dispatch(event))
    .parse_next(&mut input)
    // I don't think this can panic. many0 errors if the child parser returns
    // a success where the input was not consumed, but alt will only return Ok
    // if one of its children succeed. However, all of it's children are
    // guaranteed to consume something if they succeed, so the Ok(i) == i case
    // can never occur.
    .expect("many0(alt(...)) panicked. Likely a bug in one of the children parsers.");

    if input.is_empty() {
        return Ok(());
    }

    let mut node = ParseNode::SectionHeader;

    let res = repeat(1.., |i: &mut &'i [u8]| section(i, &mut node, dispatch))
        .map(|()| ())
        .parse_next(&mut input);
    res.map_err(|_| {
        let newlines = newlines_from(input, start);
        Error {
            line_number: newlines,
            last_attempted_parser: node,
            parsed_until: input.as_bstr().into(),
        }
    })?;

    // This needs to happen after we collect sections, otherwise the line number
    // will be off.
    if !input.is_empty() {
        let newlines = newlines_from(input, start);
        return Err(Error {
            line_number: newlines,
            last_attempted_parser: node,
            parsed_until: input.as_bstr().into(),
        });
    }

    Ok(())
}

/// Counts the `\n` bytes located between `start` and the current position of `input`.
fn newlines_from(input: &[u8], start: winnow::stream::Checkpoint<&[u8], &[u8]>) -> usize {
    let consumed_len = input.offset_from(&start);

    // Rewind a copy of the stream to `start` so the consumed bytes can be sliced off of it.
    let mut rewound = input;
    rewound.reset(&start);
    let consumed = rewound.next_slice(consumed_len);

    consumed.iter().filter(|&&byte| byte == b'\n').count()
}

fn comment<'i>(i: &mut &'i [u8]) -> ModalResult<Comment<'i>, NomError<&'i [u8]>> {
    (
        one_of([';', '#']),
        take_till(0.., |c| c == b'\n').map(|text: &[u8]| Cow::Borrowed(text.as_bstr())),
    )
        .map(|(tag, text)| Comment { tag, text })
        .parse_next(i)
}

// The tests for this parser live in the `tests` submodule, which is only compiled for test builds.
#[cfg(test)]
mod tests;

/// Parses one section: its header followed by any mix of whitespace, newlines,
/// key-value pairs and comments, dispatching an event for each piece.
///
/// If the header cannot be parsed, `i` is put back to where it was on entry and the
/// header error is returned. `node` is updated by the key-value parser to reflect
/// what was attempted last.
fn section<'i>(
    i: &mut &'i [u8],
    node: &mut ParseNode,
    dispatch: &mut dyn FnMut(Event<'i>),
) -> ModalResult<(), NomError<&'i [u8]>> {
    let before_header = i.checkpoint();
    let header = match section_header(i) {
        Ok(header) => header,
        Err(err) => {
            i.reset(&before_header);
            return Err(err);
        }
    };
    dispatch(Event::SectionHeader(header));

    // Conceptually a `many0(alt(...))`; the hand-written loop exists to allow
    // optimizing vec insertions.
    loop {
        let round_start = i.checkpoint();

        if let Some(spaces) = opt(take_spaces1).parse_next(i)? {
            dispatch(Event::Whitespace(Cow::Borrowed(spaces)));
        }

        if let Some(newlines) = opt(take_newlines1).parse_next(i)? {
            dispatch(Event::Newline(Cow::Borrowed(newlines)));
        }

        key_value_pair(i, node, dispatch)?;

        if let Some(comment) = opt(comment).parse_next(i)? {
            dispatch(Event::Comment(comment));
        }

        // A round that consumed nothing means the section body is over.
        if i.offset_from(&round_start) == 0 {
            return Ok(());
        }
    }
}

fn section_header<'i>(i: &mut &'i [u8]) -> ModalResult<section::Header<'i>, NomError<&'i [u8]>> {
    // No spaces must be between section name and section start
    let name = preceded('[', take_while(1.., is_section_char).map(bstr::ByteSlice::as_bstr)).parse_next(i)?;

    if opt(one_of::<_, _, ErrMode<NomError<&[u8]>>>(']'))
        .parse_next(i)?
        .is_some()
    {
        // Either section does not have a subsection or using deprecated
        // subsection syntax at this point.
        let header = match memchr::memrchr(b'.', name.as_bytes()) {
            Some(index) => section::Header {
                name: section::Name(Cow::Borrowed(name[..index].as_bstr())),
                separator: name.get(index..=index).map(|s| Cow::Borrowed(s.as_bstr())),
                subsection_name: name.get(index + 1..).map(|s| Cow::Borrowed(s.as_bstr())),
            },
            None => section::Header {
                name: section::Name(Cow::Borrowed(name.as_bstr())),
                separator: None,
                subsection_name: None,
            },
        };

        if header.name.is_empty() {
            return Err(winnow::error::ErrMode::from_input(i));
        }
        return Ok(header);
    }

    // Section header must be using modern subsection syntax at this point.
    (take_spaces1, delimited('"', opt(sub_section), "\"]"))
        .map(|(whitespace, subsection_name)| section::Header {
            name: section::Name(Cow::Borrowed(name)),
            separator: Some(Cow::Borrowed(whitespace)),
            subsection_name,
        })
        .parse_next(i)
}

/// Returns `true` for bytes permitted in a section name: ASCII letters, ASCII digits,
/// `-` and `.`.
fn is_section_char(c: u8) -> bool {
    matches!(c, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'-' | b'.')
}

/// Parses the content of a quoted subsection name, with escapes resolved.
///
/// The result borrows from the input as long as a single piece covers the whole name;
/// from the second piece onward the pieces are appended to an owned buffer. An empty
/// name is returned if no piece can be parsed.
fn sub_section<'i>(i: &mut &'i [u8]) -> ModalResult<Cow<'i, BStr>, NomError<&'i [u8]>> {
    let Some(first) = opt(subsection_subset).parse_next(i)? else {
        return Ok(Cow::Borrowed(Default::default()));
    };

    let mut name = Cow::Borrowed(first.as_bstr());
    loop {
        match opt(subsection_subset).parse_next(i)? {
            Some(piece) => name.to_mut().extend(piece),
            None => return Ok(name),
        }
    }
}

/// Parses one piece of a quoted subsection name: either a run of bytes that need no
/// escaping (`subsection_unescaped`, tried first) or the single byte following a
/// backslash (`subsection_escaped_char`).
fn subsection_subset<'i>(i: &mut &'i [u8]) -> ModalResult<&'i [u8], NomError<&'i [u8]>> {
    alt((subsection_unescaped, subsection_escaped_char)).parse_next(i)
}

/// Parses one or more consecutive bytes that may appear unescaped in a quoted
/// subsection name.
fn subsection_unescaped<'i>(i: &mut &'i [u8]) -> ModalResult<&'i [u8], NomError<&'i [u8]>> {
    let mut unescaped_run = take_while(1.., is_subsection_unescaped_char);
    unescaped_run.parse_next(i)
}

fn subsection_escaped_char<'i>(i: &mut &'i [u8]) -> ModalResult<&'i [u8], NomError<&'i [u8]>> {
    preceded('\\', one_of(is_subsection_escapable_char).take()).parse_next(i)
}

/// Returns `true` for every byte that may follow a backslash inside a quoted
/// subsection name, which is anything but a line feed.
fn is_subsection_escapable_char(c: u8) -> bool {
    !matches!(c, b'\n')
}

/// Returns `true` for bytes that may appear as-is inside a quoted subsection name:
/// everything except `"`, `\`, line feed and the NUL byte.
fn is_subsection_unescaped_char(c: u8) -> bool {
    !matches!(c, b'"' | b'\\' | b'\n' | 0)
}

/// Parses an optional key-value pair: a value name, optional whitespace, and the value
/// part as handled by `config_value`.
///
/// `node` is set to `ParseNode::Name` before the name is attempted and to
/// `ParseNode::Value` before the value is. If there is no value name, nothing is
/// dispatched and `Ok(())` is returned.
fn key_value_pair<'i>(
    i: &mut &'i [u8],
    node: &mut ParseNode,
    dispatch: &mut dyn FnMut(Event<'i>),
) -> ModalResult<(), NomError<&'i [u8]>> {
    *node = ParseNode::Name;
    let Some(name) = opt(config_name).parse_next(i)? else {
        return Ok(());
    };
    dispatch(Event::SectionValueName(section::ValueName(Cow::Borrowed(name))));

    if let Some(spaces) = opt(take_spaces1).parse_next(i)? {
        dispatch(Event::Whitespace(Cow::Borrowed(spaces)));
    }

    *node = ParseNode::Value;
    config_value(i, dispatch)
}

/// Parses the config name of a config pair. Assumes the input has already been
/// trimmed of any leading whitespace.
fn config_name<'i>(i: &mut &'i [u8]) -> ModalResult<&'i BStr, NomError<&'i [u8]>> {
    (
        one_of(|c: u8| c.is_ascii_alphabetic()),
        take_while(0.., |c: u8| c.is_ascii_alphanumeric() || c == b'-'),
    )
        .take()
        .map(bstr::ByteSlice::as_bstr)
        .parse_next(i)
}

/// Parses what follows a value name: either `=`, optional whitespace and the value
/// itself, or nothing at all, in which case an empty value is dispatched.
fn config_value<'i>(i: &mut &'i [u8], dispatch: &mut dyn FnMut(Event<'i>)) -> ModalResult<(), NomError<&'i [u8]>> {
    let has_separator = opt('=').parse_next(i)?.is_some();
    if !has_separator {
        // This is a special way of denoting 'empty' values which a lot of code depends on.
        // Hence, rather than fixing this everywhere else, it is left as is and handled where it
        // matters, namely where a missing key-value separator has to be told apart from one
        // that is followed by emptiness.
        dispatch(Event::Value(Cow::Borrowed("".into())));
        return Ok(());
    }

    dispatch(Event::KeyValueSeparator);
    if let Some(spaces) = opt(take_spaces1).parse_next(i)? {
        dispatch(Event::Whitespace(Cow::Borrowed(spaces)));
    }
    value_impl(i, dispatch)
}

/// Handles parsing of known-to-be values. This function handles both single
/// line values as well as values that are continuations.
///
/// A value that fits on one line is dispatched as a single `Event::Value`. A value that
/// uses backslash line continuations is dispatched as one `Event::ValueNotDone` plus
/// `Event::Newline` per continued line, followed by a final `Event::ValueDone`.
///
/// The value ends at a newline, at a `;` or `#` outside of double quotes, or at the end
/// of the input. Trailing ASCII whitespace is not part of the dispatched value, and `i`
/// is left right behind the dispatched bytes, so the trimmed whitespace and the
/// terminator remain for the caller to parse.
///
/// Every explicit error path first rewinds `i` to its position on entry. These are:
/// a backslash at the end of the input, a `\r` after a backslash that is not followed
/// by `\n`, a backslash followed by a byte other than `n`, `t`, `\`, `b`, `"` or a line
/// ending, and a value whose double quotes are unbalanced.
fn value_impl<'i>(i: &mut &'i [u8], dispatch: &mut dyn FnMut(Event<'i>)) -> ModalResult<(), NomError<&'i [u8]>> {
    // Where the whole value began; used to rewind on errors.
    let start_checkpoint = i.checkpoint();
    // Where the current segment began; moved forward after each line continuation.
    let mut value_start_checkpoint = i.checkpoint();
    // Offset from the segment start at which the value ends, once a terminator was seen.
    let mut value_end = None;

    // This is required to ignore comment markers if they're in a quote.
    let mut is_in_quotes = false;
    // Used to determine if we return a Value or Value{Not,}Done
    let mut partial_value_found = false;

    loop {
        // Skip over everything that needs no special treatment.
        let _ = take_while(0.., |c| !matches!(c, b'\n' | b'\\' | b'"' | b';' | b'#')).parse_next(i)?;
        if let Some(c) = i.next_token() {
            match c {
                // A newline ends the value even inside quotes; the quote check after the
                // loop rejects such a value.
                b'\n' => {
                    // `- 1` excludes the terminator that was just consumed.
                    value_end = Some(i.offset_from(&value_start_checkpoint) - 1);
                    break;
                }
                b';' | b'#' if !is_in_quotes => {
                    value_end = Some(i.offset_from(&value_start_checkpoint) - 1);
                    break;
                }
                b'\\' => {
                    // Offset just past the backslash, and the offset of the backslash itself,
                    // both relative to the start of the current segment.
                    let escaped_index = i.offset_from(&value_start_checkpoint);
                    let escape_index = escaped_index - 1;
                    let Some(mut c) = i.next_token() else {
                        i.reset(&start_checkpoint);
                        return Err(winnow::error::ErrMode::from_input(i));
                    };
                    // Length of the line ending after the backslash: 1 for `\n`, 2 for `\r\n`.
                    let mut consumed = 1;
                    if c == b'\r' {
                        c = i.next_token().ok_or_else(|| {
                            i.reset(&start_checkpoint);
                            winnow::error::ErrMode::from_input(i)
                        })?;
                        if c != b'\n' {
                            i.reset(&start_checkpoint);
                            return Err(winnow::error::ErrMode::from_input(i));
                        }
                        consumed += 1;
                    }

                    match c {
                        b'\n' => {
                            partial_value_found = true;

                            // Go back to the segment start to slice out what precedes the backslash.
                            i.reset(&value_start_checkpoint);

                            let value = i.next_slice(escape_index).as_bstr();
                            dispatch(Event::ValueNotDone(Cow::Borrowed(value)));

                            // Step over the backslash, it belongs to neither event.
                            i.next_token();

                            let nl = i.next_slice(consumed).as_bstr();
                            dispatch(Event::Newline(Cow::Borrowed(nl)));

                            // The next segment starts right after the line ending.
                            value_start_checkpoint = i.checkpoint();
                            value_end = None;
                        }
                        // Supported escapes are kept in the value as they are.
                        b'n' | b't' | b'\\' | b'b' | b'"' => {}
                        _ => {
                            i.reset(&start_checkpoint);
                            return Err(winnow::error::ErrMode::from_input(i));
                        }
                    }
                }
                b'"' => is_in_quotes = !is_in_quotes,
                _ => {}
            }
        } else {
            // End of input.
            break;
        }
    }
    if is_in_quotes {
        i.reset(&start_checkpoint);
        return Err(winnow::error::ErrMode::from_input(i));
    }

    let value_end = match value_end {
        // No terminator was seen, so the segment reaches up to the current position.
        None => {
            let last_value_index = i.offset_from(&value_start_checkpoint);
            if last_value_index == 0 {
                dispatch(Event::Value(Cow::Borrowed("".into())));
                return Ok(());
            } else {
                last_value_index
            }
        }
        Some(idx) => idx,
    };

    // Rewind to the segment start and consume only up to the last non-whitespace byte.
    i.reset(&value_start_checkpoint);
    let value_end_no_trailing_whitespace = i[..value_end]
        .iter()
        .enumerate()
        .rev()
        .find_map(|(idx, b)| (!b.is_ascii_whitespace()).then_some(idx + 1))
        .unwrap_or(0);
    let remainder_value = i.next_slice(value_end_no_trailing_whitespace);

    if partial_value_found {
        dispatch(Event::ValueDone(Cow::Borrowed(remainder_value.as_bstr())));
    } else {
        dispatch(Event::Value(Cow::Borrowed(remainder_value.as_bstr())));
    }

    Ok(())
}

fn take_spaces1<'i>(i: &mut &'i [u8]) -> ModalResult<&'i BStr, NomError<&'i [u8]>> {
    take_while(1.., winnow::stream::AsChar::is_space)
        .map(bstr::ByteSlice::as_bstr)
        .parse_next(i)
}

fn take_newlines1<'i>(i: &mut &'i [u8]) -> ModalResult<&'i BStr, NomError<&'i [u8]>> {
    repeat(1..1024, alt(("\r\n", "\n")))
        .map(|()| ())
        .take()
        .map(bstr::ByteSlice::as_bstr)
        .parse_next(i)
}