// bpaf 0.7.7
//
// A simple Command Line Argument Parser with parser combinators
// Documentation
use std::ffi::OsString;

/// Preprocessed command line argument
///
/// [`OsString`] in Short/Long corresponds to the original command line item, it is used for errors
#[derive(Debug, Clone, Eq, PartialEq)]
pub(crate) enum Arg {
    /// ambiguity between group of short options and a short option with an argument
    /// `-abc` can be either equivalent of `-a -b -c` or `-a=bc`
    ///
    /// OsString is always valid utf8 here
    Ambiguity(Vec<char>, OsString),

    /// short flag: `-f`
    ///
    /// bool is `true` when the item was written with an attached value (`-f=value`), in that
    /// case `push_vec` stores the value as a separate [`Arg::Word`] right after this item
    Short(char, bool, OsString),

    /// long flag: `--flag`
    ///
    /// bool is `true` when the item was written with an attached value (`--flag=value`), in
    /// that case `push_vec` stores the value as a separate [`Arg::Word`] right after this item
    Long(String, bool, OsString),

    /// separate word that can be command, positional or an argument to a flag
    ///
    /// Can start with `-` or `--`, doesn't have to be valid utf8
    ///
    /// `hello`
    Word(OsString),

    /// separate word that goes after `--`, strictly positional
    ///
    /// Can start with `-` or `--`, doesn't have to be valid utf8
    PosWord(OsString),
}

// short flag disambiguations:
//
// Short flags | short arg
// No          | No        | no problem
// Yes         | No        | use flag
// No          | Yes       | use arg
// Yes         | Yes       | ask user?
//
// -a  - just a regular short flag: "-a"
// -abc - assuming there are short flags a, b and c: "-a -b -c", assuming utf8 values AND there's no argument -a
// -abc - assuming there's no -a -b -c: "-a bc"
// -abc - assuming both short a b c AND there's argument -a - need to disambiguate on a context level
//
// 1. parse argument into an ambiguous representation that can store both short flags and argument
// 2. collect short flag/arg when entering the subparser
// 3. when reaching an ambiguous item - disambiguate it using the collected short flags/args
//
/// Renders the argument roughly the way the user typed it: flags get their dashes back,
/// everything else is printed from the stored raw item (lossily, if it is not valid utf8).
impl std::fmt::Display for Arg {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // flags are rendered from their parsed name, the remaining variants share one path
        let raw = match self {
            Arg::Short(name, ..) => return write!(f, "-{}", name),
            Arg::Long(name, ..) => return write!(f, "--{}", name),
            Arg::Ambiguity(_, raw) => raw,
            Arg::Word(raw) => raw,
            Arg::PosWord(raw) => raw,
        };
        write!(f, "{}", raw.to_string_lossy())
    }
}

/// Classify one raw command line item and append the result to `vec`
///
/// - `vec`: preprocessed arguments collected so far
/// - `os`: raw item exactly as it was received
/// - `pos_only`: set once a bare `--` was consumed; while it is set every item is stored as
///   [`Arg::PosWord`] without any further inspection
///
/// `-f=val` and `--flag=val` are stored as two items: the flag itself marked with `true`
/// followed by the value as [`Arg::Word`]. A bare `--` is itself stored as [`Arg::PosWord`].
///
/// # Panics
///
/// Panics if [`split_os_argument`] returns a short flag that carries a value and whose name is
/// not exactly one character long - that would be a bug in the splitter, not a user error.
/// In test builds it also panics if the two splitter implementations disagree.
pub(crate) fn push_vec(vec: &mut Vec<Arg>, os: OsString, pos_only: &mut bool) {
    if *pos_only {
        vec.push(Arg::PosWord(os));
        return;
    }

    let val = split_os_argument(&os);
    // in tests make sure that both splitters agree on every valid utf8 input
    #[cfg(test)]
    {
        if os.to_str().is_some() {
            let fallback = split_os_argument_fallback(&os);
            assert_eq!(val, fallback, "while parsing {:?}", os);
        }
    }
    match val {
        // -f and -fbar
        Some((ArgType::Short, short, None)) => {
            let mut keys = short.chars();
            match (keys.next(), keys.next()) {
                // exactly one character - a regular short flag
                (Some(key), None) => vec.push(Arg::Short(key, false, os)),
                // several characters - meaning depends on what the parser expects
                _ => vec.push(Arg::Ambiguity(short.chars().collect(), os)),
            }
        }
        // -f=a
        Some((ArgType::Short, short, Some(arg))) => {
            // count characters, not bytes: `ñ` in `-ñ=a` is a single key that is two bytes long
            let mut keys = short.chars();
            let key = match (keys.next(), keys.next()) {
                (Some(key), None) => key,
                _ => panic!("short flag with an argument must have only one key"),
            };
            vec.push(Arg::Short(key, true, os));
            vec.push(arg);
        }
        // --flag
        Some((ArgType::Long, long, None)) => vec.push(Arg::Long(long, false, os)),
        // --flag=a
        Some((ArgType::Long, long, Some(arg))) => {
            vec.push(Arg::Long(long, true, os));
            vec.push(arg);
        }
        // not a flag: either the `--` separator or a plain word
        None => {
            if os == "--" {
                *pos_only = true;
                vec.push(Arg::PosWord(os));
            } else {
                vec.push(Arg::Word(os));
            }
        }
    }
}

/// Kind of a flag as determined by the number of leading dashes
#[derive(Eq, PartialEq, Debug)]
pub(crate) enum ArgType {
    /// single dash: `-f`
    Short,
    /// double dash: `--flag`
    Long,
}

/// split [`OsString`] into argument specific bits
///
/// takes a possibly non-utf8 string looking like "--name=value" and splits it into bits:
/// "--" - type, "name" - name, must be representable as utf8, "=" - optional, "value" - flag
/// value, returned as [`Arg::Word`].
///
/// Returns `None` if the input doesn't start with a dash, consists only of dashes (`-`, `--`)
/// or if the name is not valid utf8 - such items are not flags.
///
/// dashes and equals sign are low codepoint values so it is possible to look for them literally
/// in the underlying sequence. This probably means not supporting dashes with diacritics, but
/// that's okay
///
/// name must be valid utf8 after conversion and must not include `=`
///
/// argument is optional and can be non valid utf8.
///
/// The idea is to split the [`OsString`] into opaque parts by looking only at the simple parts
/// and let stdlib handle the decoding of those parts.
///
/// A short name followed by more characters and then `=` (`-abc=d`) is split after the first
/// *character*: `a` is the name and `bc=d` is the value. The first character can occupy more
/// than one element of the underlying sequence (`-ñ=d`), so the split point is computed from
/// the decoded character and not hardcoded to a single element.
///
/// Notation -fbar is ambiguous and could mean either `-f -b -a -r` or `-f=bar`, it is returned
/// as a short flag named `fbar` without a value and the caller resolves it into
/// [`Arg::Ambiguity`] to let the subparser disambiguate it later depending on available short
/// flags and arguments
pub(crate) fn split_os_argument(input: &std::ffi::OsStr) -> Option<(ArgType, String, Option<Arg>)> {
    #[cfg(any(unix, windows))]
    {
        // OsString are sequences of smaller elements - bytes in unix and
        // possibly invalid utf16 items on windows
        #[cfg(unix)]
        type Elt = u8;
        #[cfg(windows)]
        type Elt = u16;

        // reuse allocation on unix, don't reuse allocations on windows
        // either case - pack a vector of elements back into OsString
        fn os_from_vec(vec: Vec<Elt>) -> OsString {
            #[cfg(unix)]
            {
                <OsString as std::os::unix::ffi::OsStringExt>::from_vec(vec)
            }
            #[cfg(windows)]
            {
                <OsString as std::os::windows::ffi::OsStringExt>::from_wide(&vec)
            }
        }

        // try to decode elements into a String
        fn str_from_vec(vec: Vec<Elt>) -> Option<String> {
            os_from_vec(vec).into_string().ok()
        }

        // number of elements taken by the first character of `name`,
        // `None` if `name` is empty or doesn't start with a valid character
        fn first_char_width(name: &[Elt]) -> Option<usize> {
            #[cfg(unix)]
            {
                // only the first character has to be valid, anything after it can be garbage
                let valid_len = match std::str::from_utf8(name) {
                    Ok(valid) => valid.len(),
                    Err(err) => err.valid_up_to(),
                };
                let valid = std::str::from_utf8(&name[..valid_len]).ok()?;
                Some(valid.chars().next()?.len_utf8())
            }
            #[cfg(windows)]
            {
                let first = std::char::decode_utf16(name.iter().copied()).next()?.ok()?;
                Some(first.len_utf16())
            }
        }

        // but in either case dashes and equals are just literal values just with different width
        const DASH: Elt = b'-' as Elt;
        const EQUALS: Elt = b'=' as Elt;

        // preallocate something to store the name. oversized but avoids extra allocations/copying
        let mut name = Vec::with_capacity(input.len());

        #[cfg(unix)]
        let mut items = std::os::unix::ffi::OsStrExt::as_bytes(input)
            .iter()
            .copied();
        #[cfg(windows)]
        let mut items = std::os::windows::ffi::OsStrExt::encode_wide(input);

        // first item must be dash, otherwise it's positional or a flag value
        if items.next()? != DASH {
            return None;
        }

        // second item may or may not be, but should be present
        let ty;
        match items.next()? {
            DASH => ty = ArgType::Long,
            val => {
                ty = ArgType::Short;
                name.push(val);
            }
        }

        // keep collecting until = or the end of the input
        loop {
            match items.next() {
                Some(EQUALS) => {
                    if ty == ArgType::Short {
                        let width = first_char_width(&name)?;
                        if name.len() > width {
                            // `-abc=d`: the first character is the name,
                            // the rest including `=` belongs to the value
                            let mut body = name.split_off(width);
                            body.push(EQUALS);
                            body.extend(items);
                            let os = Arg::Word(os_from_vec(body));
                            return Some((ty, str_from_vec(name)?, Some(os)));
                        }
                    }
                    break;
                }
                Some(val) => name.push(val),
                None => {
                    // `--` alone is not a flag
                    if name.is_empty() {
                        return None;
                    }
                    return Some((ty, str_from_vec(name)?, None));
                }
            }
        }

        // everything after the first `=` is the value and it is kept as is
        let name = str_from_vec(name)?;
        let word = Arg::Word(os_from_vec(items.collect()));
        Some((ty, name, Some(word)))
    }
    #[cfg(not(any(unix, windows)))]
    {
        split_os_argument_fallback(input)
    }
}

/// similar to [`split_os_argument`] but only works for utf8 values, used as a fallback function
/// on non windows/unix OSes
///
/// Returns `None` for anything that is not valid utf8 and in every case where
/// [`split_os_argument`] would return `None` for a valid utf8 input.
#[cfg(any(all(not(windows), not(unix)), test))]
pub(crate) fn split_os_argument_fallback(
    input: &std::ffi::OsStr,
) -> Option<(ArgType, String, Option<Arg>)> {
    // fallback supports only valid utf8 os strings, matches old behavior
    let string = input.to_str()?;

    let mut chars = string.chars();
    let mut name = String::with_capacity(string.len());

    // first character must be dash, otherwise it's positional or a flag value
    if chars.next()? != '-' {
        return None;
    }

    // second character may or may not be
    let ty;
    match chars.next()? {
        '-' => ty = ArgType::Long,
        val => {
            ty = ArgType::Short;
            name.push(val);
        }
    }

    // collect the argument's name up to '=' or until the end
    // if it's a flag
    loop {
        match chars.next() {
            Some('=') => {
                if ty == ArgType::Short {
                    // split on a character boundary: the first character can be longer than
                    // one byte and slicing a `String` in the middle of a character panics
                    let key_len = name.chars().next().map_or(0, |key| key.len_utf8());
                    if name.len() > key_len {
                        // `-abc=d`: the first character is the name,
                        // the rest including `=` belongs to the value
                        let mut body = name.split_off(key_len);
                        body.push('=');
                        body.push_str(chars.as_str());
                        let os = Arg::Word(OsString::from(body));
                        return Some((ty, name, Some(os)));
                    }
                }
                break;
            }

            Some(val) => name.push(val),
            None => {
                // `--` alone is not a flag
                if name.is_empty() {
                    return None;
                }
                return Some((ty, name, None));
            }
        }
    }

    // everything after the first `=` is the value
    Some((ty, name, Some(Arg::Word(OsString::from(chars.as_str())))))
}