1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/// A string splitter splitting on a specific char unless it's
/// part of a token starting and ending with a quote (`"`).
///
/// Whitespace at the start of each token is skipped. A token is
/// considered quoted when its first char is `"`; such a token only
/// ends at a delimiter which immediately follows a `"` (or at the end
/// of the input).
///
/// The quotes around a quoted token are kept by default; they are
/// removed only when enabled with [`SplitUnquotedChar::unwrap_quotes`].
pub struct SplitUnquotedChar<'s> {
    /// the part of the input which hasn't been consumed yet
    src: &'s str,
    /// whether the quotes surrounding a quoted token must be removed
    unwrap_quotes: bool,
    /// the char separating the tokens
    delimitor: char,
}

impl<'s> SplitUnquotedChar<'s> {
    /// Build a splitter over `src`, cutting on `delimitor`.
    ///
    /// Quote unwrapping is initially disabled.
    pub fn new(src: &'s str, delimitor: char) -> Self {
        Self {
            src,
            unwrap_quotes: false,
            delimitor,
        }
    }
    /// Set whether tokens starting and ending with a quote
    /// should have them removed.
    ///
    /// This doesn't modify `self`: it returns a new splitter over the
    /// same remaining input, with the same delimiter.
    pub fn unwrap_quotes(&self, b: bool) -> Self {
        Self {
            src: self.src,
            unwrap_quotes: b,
            delimitor: self.delimitor,
        }
    }
}

impl<'s> Iterator for SplitUnquotedChar<'s> {
    type Item = &'s str;

    /// Return the next token, or `None` when only whitespace (or
    /// nothing) remains.
    fn next(&mut self) -> Option<&'s str> {
        // we ignore spaces at the start
        self.src = self.src.trim_start();
        let c0 = self.src.chars().next()?;
        let quoted = c0 == '"';
        let mut previous = c0;
        for (bi, c) in self.src.char_indices() {
            if c == self.delimitor {
                // Inside a quoted token, a delimiter only ends the token
                // when it directly follows a quote which isn't the opening
                // one. `bi == 1` is the delimiter right after the opening
                // quote; `bi == 0` can only happen when the delimiter is
                // itself a quote, and is handled the same way so that
                // `bi - 1` below can't underflow.
                if quoted && (bi <= 1 || previous != '"') {
                    previous = c;
                    continue;
                }
                let token = if quoted && self.unwrap_quotes {
                    // the first and last quotes aren't part of the
                    // returned token
                    &self.src[1..bi - 1]
                } else {
                    &self.src[..bi]
                };
                // Consume the delimiter too: relying on `trim_start` to
                // remove it only works for whitespace delimiters.
                self.src = &self.src[bi + c.len_utf8()..];
                return Some(token);
            }
            previous = c;
        }
        // No terminating delimiter: everything left is the last token.
        let rest = self.src;
        self.src = &rest[rest.len()..];
        // A lone `"` is both the first and the last char: `len() > 1`
        // prevents unwrapping it into an invalid range.
        let unwrap = self.unwrap_quotes && quoted && previous == '"' && rest.len() > 1;
        Some(if unwrap {
            &rest[1..rest.len() - 1]
        } else {
            rest
        })
    }
}

/// Return a new iterator over the tokens of the given string
/// separated by the space char (`' '`), taking quotes into account.
///
/// The splitter is built with [`SplitUnquotedChar::new`], so quote
/// unwrapping is initially disabled.
pub fn split_unquoted_whitespace(src: &str) -> SplitUnquotedChar<'_> {
    SplitUnquotedChar::new(src, ' ')
}

/// Return a new iterator over the `delimitor` separated tokens
/// of the given string, taking quotes into account.
///
/// The splitter is built with [`SplitUnquotedChar::new`], so quote
/// unwrapping is initially disabled.
pub fn split_unquoted_char(src: &str, delimitor: char) -> SplitUnquotedChar<'_> {
    SplitUnquotedChar::new(src, delimitor)
}

#[cfg(test)]
mod split_unquoted_whitespace_test {

    use super::*;

    /// Check that splitting the input on spaces, with quote unwrapping
    /// enabled, yields exactly the listed tokens and then `None`.
    macro_rules! check_tokens {
        ($input:literal -> [$($expected:literal),* $(,)?]) => {{
            let expected: &[&str] = &[$($expected),*];
            let mut tokens = SplitUnquotedChar::new($input, ' ').unwrap_quotes(true);
            for &wanted in expected {
                assert_eq!(tokens.next(), Some(wanted));
            }
            assert_eq!(tokens.next(), None);
        }};
    }

    #[test]
    fn test_split_unquoted_whitespace() {
        // empty or blank inputs produce no token
        check_tokens!("" -> []);
        check_tokens!("    " -> []);
        // unquoted tokens, including multi-byte chars
        check_tokens!(" a    试bc d  " -> ["a", "试bc", "d"]);
        check_tokens!("e^iπ^ = 1" -> ["e^iπ^", "=", "1"]);
        check_tokens!("1234" -> ["1234"]);
        check_tokens!("1234\"" -> ["1234\""]);
        // quotes: unwrapped only when both opening and closing
        check_tokens!(r#"""# -> [r#"""#]);
        check_tokens!(r#""a""# -> [r#"a"#]);
        check_tokens!(r#" " "# -> [r#"" "#]);
        check_tokens!(r#"a  "deux mots" b"# -> ["a", "deux mots", "b"]);
        check_tokens!(r#" " ""# -> [" "]);
        check_tokens!(r#" a  "2 * 试" x"x "z "# -> ["a", "2 * 试", "x\"x", "\"z "]);
        // only the outermost pair of quotes is removed
        check_tokens!(r#"""""# -> ["\""]);
        check_tokens!(r#""""""# -> ["\"\""]);
    }
}