1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
//! POSIX Shell Compatible Argument Parser
//!
//! This crate implements POSIX Shell compatible `quote` and `unquote` operations. Quoting turns
//! an arbitrary string into a form that a shell does not interpret when it takes it as input.
//! Unquoting is the inverse: it turns such a quoted string back into the original input.
//!
//! The way this quoting works is mostly standardized by POSIX. However, many existing
//! implementations support additional features. These are explicitly not supported by this crate,
//! and it is not the intention of this crate to support these quirks and peculiarities.
//!
//! The basic operations provided are [`quote()`] and [`unquote()`], which both take a UTF-8
//! string as input, and produce the respective output string.
//!
//! # Examples
//!
//! ```
//! let str = "Hello World!";
//!
//! println!("Quoted input: {}", r_shquote::quote(str));
//! ```
//!
//! Unquote operations can fail when the input is not well defined. The returned error contains
//! diagnostics to identify where exactly the parser failed:
//!
//! ```
//! let quote = "'foobar";
//! let res = r_shquote::unquote(quote).unwrap_err();
//!
//! println!("Unquote operation failed: {}", res);
//! ```
//!
//! Combining the quote and unquote operation always produces the original input:
//!
//! ```
//! let str = "foo bar";
//!
//! assert_eq!(str, r_shquote::unquote(&r_shquote::quote(str)).unwrap());
//! ```

/// Error information for unquote operations
///
/// This error contains diagnostics from an unquote-operation. In particular, it contains the
/// character and byte offsets of the cursor where the error originated.
///
/// Errors can be compared with `==`, which allows asserting on the exact failure. Formatting an
/// error with `{}` yields a short human-readable message, while `{:?}` shows the raw variant
/// together with its fields.
///
/// # Examples
///
/// ```
/// let quote = "'Hello' 'World!";
/// let res = r_shquote::unquote(quote).unwrap_err();
///
/// match res {
///     r_shquote::UnquoteError::UnterminatedSingleQuote { char_cursor: x, .. } |
///     r_shquote::UnquoteError::UnterminatedDoubleQuote { char_cursor: x, .. } => {
///         println!("Input: {}", quote);
///         println!("       {}^--- unterminated quote", " ".repeat(x));
///     },
/// }
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum UnquoteError {
    /// A single-quote sequence was opened, but the input ended before it was closed.
    UnterminatedSingleQuote {
        /// Offset of the cursor where the error originated, counted in characters.
        char_cursor: usize,
        /// Offset of the cursor where the error originated, counted in bytes.
        byte_cursor: usize,
    },
    /// A double-quote sequence was opened, but the input ended before it was closed.
    UnterminatedDoubleQuote {
        /// Offset of the cursor where the error originated, counted in characters.
        char_cursor: usize,
        /// Offset of the cursor where the error originated, counted in bytes.
        byte_cursor: usize,
    },
}

impl std::fmt::Display for UnquoteError {
    /// Writes a human-readable description of the error, including both cursor offsets.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        // `Display` is meant for end users, so spell out what went wrong rather than dumping
        // the `Debug` representation of the variant.
        match *self {
            UnquoteError::UnterminatedSingleQuote { char_cursor, byte_cursor } => write!(
                f,
                "unterminated single quote at character {} (byte {})",
                char_cursor, byte_cursor
            ),
            UnquoteError::UnterminatedDoubleQuote { char_cursor, byte_cursor } => write!(
                f,
                "unterminated double quote at character {} (byte {})",
                char_cursor, byte_cursor
            ),
        }
    }
}

// The error carries no underlying cause, so the default trait methods are sufficient.
impl std::error::Error for UnquoteError {}

/// Quote string
///
/// Quotes `source` according to POSIX Shell rules. When the result is handed to a POSIX
/// compatible shell, it is interpreted as one single token. [`unquote()`] implements the
/// inverse operation.
///
/// Quoting is not canonical: every string has infinitely many valid quoted forms. This
/// implementation always emits sequences of single-quotes, which mimics what a lot of other
/// implementations do. It may also emit redundant quotes, even though a shorter output would be
/// possible; this again keeps the output comparable to other existing implementations.
/// Nevertheless, a caller must never try to predict the exact escaping and quoting done by this
/// function.
///
/// # Examples
///
/// ```
/// assert_eq!(r_shquote::quote("foobar"), "'foobar'");
/// ```
pub fn quote(source: &str) -> String {
    // The output is intentionally verbose rather than minimal, for instance:
    //   `'` => `''\'''`
    //   `` => `''`
    // This keeps the behavior in line with other implementations and the code simple. An
    // optimized variant can always be provided separately if it is ever requested.

    // Two extra bytes for the enclosing quotes; embedded single-quotes make the string grow.
    let mut quoted = String::with_capacity(source.len() + 2);

    quoted.push('\'');
    for (index, segment) in source.split('\'').enumerate() {
        // Each boundary between two segments stands for one single-quote of the input. It
        // cannot be expressed inside a single-quote sequence, so the sequence is closed, an
        // escaped single-quote is emitted, and a new sequence is opened: `'\''`.
        if index > 0 {
            quoted.push_str("\'\\\'\'");
        }
        quoted.push_str(segment);
    }
    quoted.push('\'');

    quoted
}

fn unquote_open_single(acc: &mut String, cursor: &mut std::iter::Enumerate<std::str::CharIndices>) -> bool {
    // Decode the remainder of a single-quote sequence. The caller already consumed the opening
    // single-quote, so `cursor` yields the first character following it.
    // Single-quote sequences know no escape sequences at all, not even for the single-quote
    // itself. Hence, every character up to the next single-quote is appended verbatim to
    // `acc`, and that closing single-quote is consumed without producing output.
    // Returns `true` if the closing single-quote was found, and `false` if the input ended
    // first, in which case the caller has to treat the entire operation as invalid.
    for (_, (_, ch)) in cursor {
        if ch == '\'' {
            return true;
        }

        acc.push(ch);
    }

    false
}

fn unquote_open_double(acc: &mut String, cursor: &mut std::iter::Enumerate<std::str::CharIndices>) -> bool {
    // Decode the remainder of a double-quote sequence. The caller already consumed the opening
    // double-quote, so `cursor` yields the first character following it.
    // Returns `true` once the closing double-quote was consumed. Returns `false` if the input
    // ends before that, which means the entire operation has to be refused by the caller.
    while let Some((_, (_, ch))) = cursor.next() {
        match ch {
            // An unescaped double-quote ends the sequence and produces no output.
            '"' => return true,

            // A backslash opens an escape sequence; the character after it decides the output.
            '\\' => match cursor.next() {
                Some((_, (_, escaped))) => {
                    match escaped {
                        // Known sequences produce just the escaped character, which is
                        // thereby not interpreted by the parser.
                        '"' | '\\' | '`' | '$' | '\n' => {},
                        // Unknown sequences are copied verbatim in their entirety, so the
                        // backslash is part of the output.
                        _ => acc.push('\\'),
                    }
                    acc.push(escaped);
                },
                // A dangling backslash leaves the escape sequence unterminated, and with it
                // the whole double-quote sequence.
                None => return false,
            },

            // Everything else is copied literally, just like characters outside of quotes.
            _ => acc.push(ch),
        }
    }

    // The input ended without a closing double-quote.
    false
}

fn unquote_open_escape(acc: &mut String, cursor: &mut std::iter::Enumerate<std::str::CharIndices>) {
    // Decode an escape sequence outside of any quote. The caller already consumed the opening
    // backslash, so `cursor` yields the first character following it.
    // Outside of quotes, an escape sequence makes the next character literal, so it is
    // appended to `acc` without being interpreted. There are two exceptions, which both strip
    // the escape sequence and produce no output: a backslash as the very last character of
    // the input, and a backslash followed by a literal <NL> (newline character). The <NL> case
    // is a remnant of human shell input, where a trailing backslash continues the input on the
    // next line. Both the backslash and the <NL> are ignored then, since they purely serve
    // the readability of user input.
    match cursor.next() {
        None | Some((_, (_, '\n'))) => {},
        Some((_, (_, ch))) => acc.push(ch),
    }
}

/// Unquote String
///
/// Unquotes a single string according to POSIX Shell quoting and escaping rules. If `source` is
/// not a valid input, the operation fails and the returned error provides diagnosis information
/// on where the first invalid part was encountered.
///
/// The result is canonical. There is only one valid unquoted result for a given input.
///
/// # Errors
///
/// Returns [`UnquoteError`] if a single-quote or double-quote sequence is opened but never
/// terminated. The cursors in the error refer to the opening quote of that sequence.
///
/// # Examples
///
/// ```
/// assert_eq!(r_shquote::unquote("foobar").unwrap(), "foobar");
/// ```
pub fn unquote(source: &str) -> Result<String, UnquoteError> {
    // Unquoting never produces a longer string than its input, and in the common case most of
    // the input is neither quoted nor escaped. Hence, reserving the size of the input up front
    // is all that is needed.
    let mut unquoted = String::with_capacity(source.len());

    // Walk over the input character by character, keeping track of both the character index
    // and the byte position for diagnostics. Whenever a single-quote, double-quote, or escape
    // sequence is opened, our helpers parse that sub-string and advance the shared cursor.
    // Anything else is copied over literally.
    let mut cursor = source.char_indices().enumerate();
    while let Some((char_idx, (byte_pos, ch))) = cursor.next() {
        match ch {
            '\'' => {
                if !unquote_open_single(&mut unquoted, &mut cursor) {
                    return Err(UnquoteError::UnterminatedSingleQuote {
                        char_cursor: char_idx,
                        byte_cursor: byte_pos,
                    });
                }
            },
            '"' => {
                if !unquote_open_double(&mut unquoted, &mut cursor) {
                    return Err(UnquoteError::UnterminatedDoubleQuote {
                        char_cursor: char_idx,
                        byte_cursor: byte_pos,
                    });
                }
            },
            '\\' => unquote_open_escape(&mut unquoted, &mut cursor),
            _ => unquoted.push(ch),
        }
    }

    Ok(unquoted)
}

#[cfg(test)]
mod tests {
    use super::*;

    // Smoke test of the most common quote and unquote inputs.
    #[test]
    fn basic() {
        assert_eq!(quote("foobar"), "'foobar'");
        assert_eq!(quote(""), "''");
        assert_eq!(quote("'"), "''\\'''");

        assert_eq!(unquote("foobar").unwrap(), "foobar");
        assert_eq!(unquote("foo'bar'").unwrap(), "foobar");
        assert_eq!(unquote("foo\"bar\"").unwrap(), "foobar");
        assert_eq!(unquote("\\foobar\\").unwrap(), "foobar");
        assert_eq!(unquote("\\'foobar\\'").unwrap(), "'foobar'");
    }

    // Unquoting a quoted string must always yield the original input, including inputs made
    // of characters that are special to a shell and multi-byte characters.
    #[test]
    fn roundtrip() {
        let inputs = [
            "",
            "foobar",
            "foo bar",
            "it's",
            "'",
            "''",
            "a\"b",
            "back\\slash",
            "new\nline",
            "$HOME `x`",
            "h\u{e9}llo w\u{f6}rld",
        ];

        for input in inputs.iter() {
            assert_eq!(unquote(&quote(input)).unwrap(), *input);
        }
    }

    // Single-quote sequences are verbatim; double-quote sequences only interpret the known
    // escape sequences and keep the backslash of any other one.
    #[test]
    fn unquote_quotes() {
        assert_eq!(unquote("'a\"b'").unwrap(), "a\"b");
        assert_eq!(unquote("'a\\b'").unwrap(), "a\\b");

        assert_eq!(unquote("\"foo bar\"").unwrap(), "foo bar");
        assert_eq!(unquote("\"it's\"").unwrap(), "it's");
        assert_eq!(unquote("\"a\\\"b\"").unwrap(), "a\"b");
        assert_eq!(unquote("\"a\\$b\\`c\\\\d\"").unwrap(), "a$b`c\\d");
        assert_eq!(unquote("\"a\\nb\"").unwrap(), "a\\nb");
    }

    // Outside of quotes a backslash makes the next character literal, except for a newline
    // and the end of the input, which are both stripped together with the backslash.
    #[test]
    fn unquote_escapes() {
        assert_eq!(unquote("foo\\ bar").unwrap(), "foo bar");
        assert_eq!(unquote("\\\"foo\\\"").unwrap(), "\"foo\"");
        assert_eq!(unquote("foo\\\nbar").unwrap(), "foobar");
        assert_eq!(unquote("foo\\").unwrap(), "foo");
    }

    // Quote sequences that are never closed must be refused.
    #[test]
    fn unquote_unterminated() {
        assert!(unquote("'foobar").is_err());
        assert!(unquote("\"foobar").is_err());
        assert!(unquote("\"foobar\\").is_err());
        assert!(unquote("foo 'bar' \"baz").is_err());
        assert!(unquote("'Hello' 'World!").is_err());
    }
}