// shi 0.1.5 - Docs.rs

/// The result of tokenizing a single input line.
///
/// The tokens are slices borrowed from the line that was tokenized, hence the `'a` lifetime.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Tokenization<'a> {
    /// The tokens of the line, in the order they were produced.
    pub tokens: Vec<&'a str>,
    /// Whether the tokenized line had a trailing space.
    pub trailing_space: bool,
}

/// A `Tokenizer` pre-processes an input line into a sequence of `&str` tokens for a parser.
///
/// The tokens are essentially a way to split a line apart into a command and its arguments.
/// Unlike a conventional tokenizer, an implementation does not necessarily emit a variety of
/// token kinds; its purpose is closer to that of a scanner.
pub trait Tokenizer {
    /// Tokenizes `line`, returning a `Tokenization` that holds the tokens (slices borrowed from
    /// `line`) and a bool indicating whether there was a trailing space.
    fn tokenize<'a>(&self, line: &'a str) -> Tokenization<'a>;
}

/// `DefaultTokenizer` tokenizes an input string into tokens based on some default, basic rules.
///
/// It splits the line by spaces, while keeping text that is enclosed in a pair of matching
/// quotation characters together as a single token.
pub struct DefaultTokenizer {
    /// The characters that are treated as quotation marks.
    quotations: Vec<char>,
}

#[derive(Debug, PartialEq)]
/// Describes the position of a single quotation mark within a line.
///
/// Quotation marks are generally either `"` or `'`, but can be any character.
struct QuoteLoc {
    /// Byte offset of the quotation mark within the line.
    pos: usize,
    /// The quotation character found at `pos`.
    quotation: char,
}

#[derive(Debug)]
/// Describes a pair of matching quotes.
///
/// Quotation marks are generally either `"` or `'`, but can be any character.
struct QuotePair {
    /// Position of the opening quotation mark (the `pos` of its `QuoteLoc`).
    start: usize,
    /// Position of the closing quotation mark (the `pos` of its `QuoteLoc`).
    end: usize,
    /// The quotation character shared by both ends of the pair.
    quotation: char,
}

#[derive(Debug, PartialEq)]
/// Describes a 'blob' of the input string.
///
/// A 'blob' can be thought of as a chunk or portion of the string. A blob is either a quoted
/// chunk of the string or a non-quoted chunk; there are no other cases. Blobs are contiguous and
/// thus do not overlap.
enum Blob<'a> {
    /// A chunk of the line that is not enclosed in a pair of quotes.
    Normal(&'a str),
    /// The text between a pair of matching quotes, without the quotes themselves.
    Quoted(&'a str),
}

/// Some shorthand functions for constructing Blobs. Only compiled for tests.
#[cfg(test)]
impl<'a> Blob<'a> {
    /// Constructs a `Normal` blob wrapping `s`.
    ///
    /// The returned blob borrows from `s`; the lifetime is spelled `'_` so that the borrow is
    /// visible in the signature rather than hidden behind a bare `Blob`.
    fn n(s: &str) -> Blob<'_> {
        Blob::Normal(s)
    }

    /// Constructs a `Quoted` blob wrapping `s`.
    ///
    /// The returned blob borrows from `s`; see `n()` for why the lifetime is written out.
    fn q(s: &str) -> Blob<'_> {
        Blob::Quoted(s)
    }
}

impl DefaultTokenizer {
    /// Constructs a `DefaultTokenizer`.
    ///
    /// # Arguments
    /// `quotations` - The characters that should be treated as quotation marks.
    pub fn new(quotations: Vec<char>) -> DefaultTokenizer {
        DefaultTokenizer { quotations }
    }

    /// Finds quotes in the line string, and returns them.
    ///
    /// This method does not have any intelligence around pairing of quotation marks, it simply
    /// finds and returns the ones it sees.
    ///
    /// # Arguments
    /// `line` - The input line.
    ///
    /// # Returns
    /// `Vec<QuoteLoc>` - A listing of all the quotation marks, in order of appearance. Each
    /// position is a byte offset into `line`.
    fn find_quotes(&self, line: &str) -> Vec<QuoteLoc> {
        line.char_indices()
            .filter(|(_, ch)| self.quotations.contains(ch))
            .map(|(pos, quotation)| QuoteLoc { pos, quotation })
            .collect()
    }

    /// Finds pairings of balanced quotes in the string, given a series of quote locations.
    ///
    /// This method is the intelligent sibling of `find_quotes()`. It takes the `QuoteLoc`'s
    /// returned by `find_quotes()` and pairs together the `QuoteLoc`'s into `QuotePair`'s.
    ///
    /// # Arguments
    /// `quote_locs` - The quote locations as returned by `find_quotes()`.
    ///
    /// # Returns
    /// `Vec<QuotePair>` - The paired couples of quotes based on the given quote locations.
    fn find_quote_pairs(&self, quote_locs: Vec<QuoteLoc>) -> Vec<QuotePair> {
        let mut quote_pairs: Vec<QuotePair> = Vec::new();
        let mut start_idx = 0;

        // For each candidate opening quote, look ahead for the next quote of the same
        // character. If one exists, the two form a pair and every quote in between is
        // discarded (it is contained within the outer quotes); the search then resumes right
        // after the closing quote. If none exists, the candidate is dangling and we simply
        // move on to the next quote location.
        while let Some(start) = quote_locs.get(start_idx) {
            let closing = quote_locs
                .iter()
                .enumerate()
                .skip(start_idx + 1)
                .find(|(_, loc)| loc.quotation == start.quotation);

            match closing {
                Some((end_idx, end)) => {
                    quote_pairs.push(QuotePair {
                        start: start.pos,
                        end: end.pos,
                        quotation: start.quotation,
                    });
                    start_idx = end_idx + 1;
                }
                None => start_idx += 1,
            }
        }

        quote_pairs
    }

    /// Creates blobs from the original line based on the given quote pairs.
    ///
    /// This function essentially breaks apart the line into quoted and non-quoted pieces.
    ///
    /// # Arguments
    /// `line` - The input line.
    /// `pairs` - The listing of quote pairs. These are expected to be byte offsets into `line`,
    /// in ascending order and non-overlapping, as produced by `find_quote_pairs()`.
    ///
    /// # Returns
    /// `Vec<Blob>` - The listing of quoted & non-quoted blobs of the input line. This is empty
    /// if `pairs` is empty.
    fn construct_slices_from_pairs<'a>(
        &self,
        line: &'a str,
        pairs: Vec<QuotePair>,
    ) -> Vec<Blob<'a>> {
        let mut blobs = Vec::new();

        let mut cur = 0;
        // Now we have the pairs. Get the slices.
        for pair in &pairs {
            // Positions are byte offsets, so stepping over a quotation mark must advance by
            // its encoded width. Assuming a width of 1 would slice in the middle of a
            // multi-byte quotation character and panic.
            let quote_len = pair.quotation.len_utf8();

            // If the current position does not match the pair.start, that means that the region of
            // the input from cur to pair.start is itself a blob, and it's unquoted. Let's make
            // sure we don't forget that.
            if cur != pair.start {
                blobs.push(Blob::Normal(&line[cur..pair.start]));
            }

            // Of course, the quote pair describes a blob by its region in the line.
            blobs.push(Blob::Quoted(&line[pair.start + quote_len..pair.end]));
            cur = pair.end + quote_len;
        }

        // If the last quote pair does not end at the end of the line, then there is an extra
        // unquoted blob after it. `cur` already points just past the last closing quote.
        if !pairs.is_empty() && cur != line.len() {
            blobs.push(Blob::Normal(&line[cur..]));
        }

        blobs
    }

    /// Globs together parts of the string that are surrounded by quotation marks, and returns a
    /// series of blobs of the input line based on it.
    ///
    /// In practice for shi, this refers to ASCII " and ', but it is written generally for any set
    /// of quotation characters, including multi-byte ones.
    ///
    /// # Arguments
    /// `line` - The input line.
    ///
    /// # Returns
    /// `Vec<Blob>` - The listing of quoted & non-quoted blobs of the input line. If no quotes
    /// could be paired, this is a single `Normal` blob holding the whole line.
    fn split_into_quote_blobs<'a>(&self, line: &'a str) -> Vec<Blob<'a>> {
        // This is not a particularly fast algorithm. But it doesn't need to be. Instead, we opt
        // for clarity.

        // First, identify where all the quotes are, then pair them accordingly.
        let quote_pairs = self.find_quote_pairs(self.find_quotes(line));

        // If no quotes matched, then just pretend we don't care (because we don't).
        if quote_pairs.is_empty() {
            return vec![Blob::Normal(line)];
        }

        // Finally, use the pair ranges to construct the individual slices.
        self.construct_slices_from_pairs(line, quote_pairs)
    }

    /// Splits the given blobs by spaces, and returns the flattened vector of splits.
    ///
    /// The key thing to note here is that a _quoted_ blob is not split, and maintained.
    /// Whereas non-quoted blobs are split by space.
    ///
    /// # Arguments
    /// `line_blobs` - The blobs of an input line.
    ///
    /// # Returns
    /// `Vec<&str>` - A series of slices into an input line that represent its component tokens.
    fn split_by_space<'a>(&self, line_blobs: Vec<Blob<'a>>) -> Vec<&'a str> {
        let mut tokens: Vec<&'a str> = Vec::new();
        for blob in line_blobs {
            match blob {
                // Not protected by surrounding quotes, so we _do_ want to split this. Only the
                // space character splits. Empty pieces are dropped, since they are meaningless
                // and are likely just the result of leading/trailing/repeated spaces.
                Blob::Normal(s) => tokens.extend(s.split(' ').filter(|part| !part.is_empty())),
                // We don't want to split inside the quote, so just add this as-is.
                Blob::Quoted(s) => tokens.push(s),
            }
        }

        tokens
    }
}

#[cfg(test)]
#[test]
fn test_find_quotes() {
    // In the input below, the two single quotes sit at offsets 6 and 14.
    let expected: Vec<QuoteLoc> = [6, 14]
        .iter()
        .map(|&pos| QuoteLoc {
            pos,
            quotation: '\'',
        })
        .collect();

    let tokenizer = DefaultTokenizer::new(vec!['\'']);
    assert_eq!(tokenizer.find_quotes("hello 'how are' you?"), expected);
}

impl Tokenizer for DefaultTokenizer {
    /// Breaks the given input line into its tokens.
    ///
    /// Text enclosed in a matched pair of quotation characters is kept together and is not
    /// split; everything outside of quotes is split by space.
    ///
    /// # Arguments
    /// `line` - The input line.
    ///
    /// # Returns
    /// `Tokenization` - The tokens, as slices into the input line, together with a flag that is
    /// set when the line ends with a space.
    fn tokenize<'a>(&self, line: &'a str) -> Tokenization<'a> {
        let trailing_space = line.ends_with(' ');

        // Group the quoted regions first, so that the space-splitting leaves them intact.
        let blobs = self.split_into_quote_blobs(line);
        let tokens = self.split_by_space(blobs);

        Tokenization {
            tokens,
            trailing_space,
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;

    // tokenize() is just a composition of split_into_quote_blobs() and split_by_space(), both of
    // which are tested individually below, so most of the coverage is already handled.
    // So, we give one big and complex case.
    #[test]
    fn tokenize() {
        use pretty_assertions::assert_eq;
        let tokenizer = DefaultTokenizer::new(vec!['"', '\'', '|', '-']);
        assert_eq!(
            tokenizer.tokenize(
                "bar 'foo is here' and quux is not\n necessarily 'here'\" b\"ut you co|uld say 'there'-",
            ).tokens,
            vec![
                "bar",
                "foo is here",
                "and",
                "quux",
                "is",
                "not\n",
                "necessarily",
                "here",
                " b",
                "ut",
                "you",
                "co|uld",
                "say",
                "there",
                "-",
            ]
        )
    }

    // Tests for split_into_quote_blobs().
    mod glob_quotes {
        use super::*;
        use pretty_assertions::assert_eq;

        #[test]
        fn basic_single() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("foo 'hi there!' btw hello"),
                vec![Blob::n("foo "), Blob::q("hi there!"), Blob::n(" btw hello")]
            );
        }

        #[test]
        fn basic_double() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("foo \"hi there!\" btw hello"),
                vec![Blob::n("foo "), Blob::q("hi there!"), Blob::n(" btw hello")]
            );
        }

        #[test]
        fn no_quotes() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("foo hi there! btw hello"),
                vec![Blob::n("foo hi there! btw hello")]
            );
        }

        #[test]
        fn quote_at_left() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("'foo hi' there! btw hello"),
                vec![Blob::q("foo hi"), Blob::n(" there! btw hello")]
            );
        }

        #[test]
        fn quote_at_right() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("there! btw hello 'foo hi'"),
                vec![Blob::n("there! btw hello "), Blob::q("foo hi")]
            );
        }

        // A quote with no matching partner is left in place as part of a normal blob.
        #[test]
        fn single_dangling() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("there! btw hello 'foo hi"),
                vec![Blob::n("there! btw hello 'foo hi")]
            );
        }

        #[test]
        fn multiple_dangling() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'', '|']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("abc'defghijklmnopq\"rstuvwxyz|vvvv|v"),
                vec![
                    Blob::n("abc'defghijklmnopq\"rstuvwxyz"),
                    Blob::q("vvvv"),
                    Blob::n("v")
                ]
            );
        }

        #[test]
        fn one_success_amongst_dangling() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'', '|', '-', '.']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("abc'defghi.jklmnopq\"rstu\"vwx-yz|vvvv|v"),
                vec![
                    Blob::n("abc'defghi.jklmnopq"),
                    Blob::q("rstu"),
                    Blob::n("vwx-yz"),
                    Blob::q("vvvv"),
                    Blob::n("v")
                ]
            );
        }

        // A different kind of quote inside a matched pair is kept as part of the quoted text.
        #[test]
        fn dangling_inside_matched_quotes() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("there! btw hello 'foo\" hi'"),
                vec![Blob::n("there! btw hello "), Blob::q("foo\" hi")]
            );
        }

        #[test]
        fn dangling_after_matched_quotes() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("there! btw 'foo hi' \" hello"),
                vec![
                    Blob::n("there! btw "),
                    Blob::q("foo hi"),
                    Blob::n(" \" hello")
                ]
            );
        }

        #[test]
        fn dangling_before_matched_quotes() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("there!\" btw 'foo hi' hello"),
                vec![
                    Blob::n("there!\" btw "),
                    Blob::q("foo hi"),
                    Blob::n(" hello")
                ]
            );
        }

        #[test]
        fn dangling_at_start() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("'there! btw foo hi hello"),
                vec![Blob::n("'there! btw foo hi hello")]
            );
        }

        #[test]
        fn dangling_at_end() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("there! btw foo hi hello'"),
                vec![Blob::n("there! btw foo hi hello'")]
            );
        }

        #[test]
        fn dangling_at_start_with_pair() {
            let tokenizer = DefaultTokenizer::new(vec!['|', '\'']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("'there! btw |foo |hi hello"),
                vec![
                    Blob::n("'there! btw "),
                    Blob::q("foo "),
                    Blob::n("hi hello")
                ]
            );
        }

        #[test]
        fn dangling_at_end_with_pair() {
            let tokenizer = DefaultTokenizer::new(vec!['|', '\'']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("there! btw |foo |hi hello'"),
                vec![
                    Blob::n("there! btw "),
                    Blob::q("foo "),
                    Blob::n("hi hello'")
                ]
            );
        }

        #[test]
        fn multiple_non_overlapping_pairs() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("abc'defg'hijk'lmno'pqr'stuvwx'yz"),
                vec![
                    Blob::n("abc"),
                    Blob::q("defg"),
                    Blob::n("hijk"),
                    Blob::q("lmno"),
                    Blob::n("pqr"),
                    Blob::q("stuvwx"),
                    Blob::n("yz")
                ]
            );
        }

        #[test]
        fn many_kinds_of_quotes() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'', '|']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("abc'defg'hijk'lmno'pqr'stuvwx'yz|vvvv|v"),
                vec![
                    Blob::n("abc"),
                    Blob::q("defg"),
                    Blob::n("hijk"),
                    Blob::q("lmno"),
                    Blob::n("pqr"),
                    Blob::q("stuvwx"),
                    Blob::n("yz"),
                    Blob::q("vvvv"),
                    Blob::n("v")
                ]
            );
        }

        // Only `|` is configured as a quotation character here, so the single quotes in the
        // input are ordinary text.
        #[test]
        fn only_one_quote() {
            let tokenizer = DefaultTokenizer::new(vec!['|']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("abc'defg'hijk'lmno'pqr'stuvwx'yz|vvvv|v"),
                vec![
                    Blob::n("abc'defg'hijk'lmno'pqr'stuvwx'yz"),
                    Blob::q("vvvv"),
                    Blob::n("v")
                ]
            );
        }

        #[test]
        fn multiple_kinds_of_quotes() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("abc'defg'hijklmnopqr\"stuvwx\"yz"),
                vec![
                    Blob::n("abc"),
                    Blob::q("defg"),
                    Blob::n("hijklmnopqr"),
                    Blob::q("stuvwx"),
                    Blob::n("yz")
                ]
            );
        }

        // With no quote pairs at all, the whole (empty) line comes back as one normal blob.
        #[test]
        fn empty_string() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(tokenizer.split_into_quote_blobs(""), vec![Blob::n("")]);
        }

        #[test]
        fn only_quotes() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("''''''''"),
                // There are 4 pairs of single quotes above, so 4 blobs:
                vec![Blob::q(""), Blob::q(""), Blob::q(""), Blob::q("")]
            );
        }

        // Pairs are matched greedily from left to right; a quote of the other kind that falls
        // between the two ends of a pair becomes the content of that pair.
        #[test]
        fn mixture_of_only_quotes() {
            let tokenizer = DefaultTokenizer::new(vec!['|', '\'']);
            assert_eq!(
                tokenizer.split_into_quote_blobs("''||'|''|'||''|'|"),
                vec![
                    Blob::q(""),
                    Blob::q(""),
                    Blob::q("|"),
                    Blob::q("|"),
                    Blob::q(""),
                    Blob::q(""),
                    Blob::q("'")
                ]
            );
        }
    }

    // Tests for split_by_space().
    mod split_by_space {
        use super::*;

        #[test]
        fn empty_string() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            let empty_vec: Vec<&str> = Vec::new();
            assert_eq!(tokenizer.split_by_space(vec![]), empty_vec);
        }

        #[test]
        fn empty_blob() {
            // Note: split_into_quote_blobs("") does produce a single empty normal blob (see
            // glob_quotes::empty_string), so this case can occur in practice.
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            // We expect us to not include the empty string, since the tokenizer considers it
            // useless.
            let empty_vec: Vec<&str> = Vec::new();
            assert_eq!(tokenizer.split_by_space(vec![Blob::n("")]), empty_vec);
        }

        #[test]
        fn multiple_empty_blobs() {
            // Ditto comments in the empty_blob() test.
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            let empty_vec: Vec<&str> = Vec::new();
            assert_eq!(
                tokenizer.split_by_space(vec![Blob::n(""), Blob::n(""), Blob::n("")]),
                empty_vec
            );
        }

        #[test]
        fn only_normals() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_by_space(vec![Blob::n("hi there"), Blob::n("euler is cool")]),
                vec!["hi", "there", "euler", "is", "cool"]
            );
        }

        #[test]
        fn only_quoteds() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_by_space(vec![Blob::q("hi there"), Blob::q("euler is cool")]),
                vec!["hi there", "euler is cool"]
            );
        }

        #[test]
        fn quoted_then_normal() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_by_space(vec![Blob::q("hi there!"), Blob::n("euler is cool")]),
                vec!["hi there!", "euler", "is", "cool"]
            );
        }

        #[test]
        fn normal_then_quoted() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_by_space(vec![Blob::n("euler is cool"), Blob::q("hi there!")]),
                vec!["euler", "is", "cool", "hi there!"]
            );
        }

        #[test]
        fn quoted_surrounded_by_normals() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_by_space(vec![
                    Blob::n("euler is cool"),
                    Blob::q("hi there!"),
                    Blob::n("euler is cool")
                ]),
                vec!["euler", "is", "cool", "hi there!", "euler", "is", "cool"]
            );
        }

        #[test]
        fn normal_surrounded_by_quoteds() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_by_space(vec![
                    Blob::q("hi there!"),
                    Blob::n("euler is cool"),
                    Blob::q("hi there!")
                ]),
                vec!["hi there!", "euler", "is", "cool", "hi there!"]
            );
        }

        #[test]
        fn trailing_spaces() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_by_space(vec![
                    Blob::q("hi there!"),
                    Blob::n("euler is cool "),
                    Blob::q("hi there!")
                ]),
                vec!["hi there!", "euler", "is", "cool", "hi there!"]
            );
        }

        #[test]
        fn with_newline() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_by_space(vec![
                    Blob::q("hi there!"),
                    // We expect the newline to not be used as a splitting term.
                    Blob::n("euler is\ncool "),
                    Blob::q("hi there!")
                ]),
                vec!["hi there!", "euler", "is\ncool", "hi there!"]
            );
        }

        #[test]
        fn with_tab() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_by_space(vec![
                    Blob::q("hi there!"),
                    // We expect the tab to not be used as a splitting term.
                    Blob::n("euler is\tcool "),
                    Blob::q("hi there!")
                ]),
                vec!["hi there!", "euler", "is\tcool", "hi there!"]
            );
        }

        #[test]
        fn multiple_spaces() {
            let tokenizer = DefaultTokenizer::new(vec!['"', '\'']);
            assert_eq!(
                tokenizer.split_by_space(vec![
                    Blob::q("hi there!"),
                    // We expect runs of consecutive spaces in a normal blob to produce no empty
                    // tokens, while the spaces inside the quoted blob below are preserved.
                    Blob::n("euler    is   cool "),
                    Blob::q("hi   there!")
                ]),
                vec!["hi there!", "euler", "is", "cool", "hi   there!"]
            );
        }
    }
}