r_shquote/
lib.rs

1//! POSIX Shell Compatible Argument Parser
2//!
//! This crate implements POSIX Shell compatible `quote` and `unquote` operations. Quoting turns an
//! arbitrary string into a form that a shell takes as one literal token instead of interpreting
//! it. Unquoting reverses this and recovers the original input from a quoted string.
6//!
7//! The way this quoting works is mostly standardized by POSIX. However, many existing
8//! implementations support additional features. These are explicitly not supported by this crate,
9//! and it is not the intention of this crate to support these quirks and peculiarities.
10//!
11//! The basic operations provided are [`quote()`] and [`unquote()`], which both take a UTF-8
12//! string as input, and produce the respective output string.
13//!
14//! # Examples
15//!
16//! ```
17//! let str = "Hello World!";
18//!
19//! println!("Quoted input: {}", r_shquote::quote(str));
20//! ```
21//!
22//! Unquote operations can fail when the input is not well defined. The returned error contains
23//! diagnostics to identify where exactly the parser failed:
24//!
25//! ```
26//! let quote = "'foobar";
27//! let res = r_shquote::unquote(quote).unwrap_err();
28//!
29//! println!("Unquote operation failed: {}", res);
30//! ```
31//!
32//! Combining the quote and unquote operation always produces the original input:
33//!
34//! ```
35//! let str = "foo bar";
36//!
37//! assert_eq!(str, r_shquote::unquote(&r_shquote::quote(str)).unwrap());
38//! ```
39
/// Error information for unquote operations
///
/// This error contains diagnostics from an unquote-operation. In particular, it contains the
/// character and byte offsets of the cursor where the error originated. For both variants the
/// offsets identify the opening quote character whose sequence was never terminated.
///
/// The [`Display`](std::fmt::Display) output is a short human-readable message; use the
/// [`Debug`](std::fmt::Debug) output to dump the raw variant and fields.
///
/// # Examples
///
/// ```
/// let quote = "'Hello' 'World!";
/// let res = r_shquote::unquote(quote).unwrap_err();
///
/// match res {
///     r_shquote::UnquoteError::UnterminatedSingleQuote { char_cursor: x, .. } |
///     r_shquote::UnquoteError::UnterminatedDoubleQuote { char_cursor: x, .. } => {
///         println!("Input: {}", quote);
///         println!("       {}^--- unterminated quote", " ".repeat(x));
///     },
/// }
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum UnquoteError {
    /// A single-quote sequence was opened, but the input ended before its closing `'`.
    UnterminatedSingleQuote {
        /// Offset of the opening quote, counted in characters from the start of the input.
        char_cursor: usize,
        /// Offset of the opening quote, counted in bytes from the start of the input.
        byte_cursor: usize,
    },
    /// A double-quote sequence was opened, but the input ended before its closing `"`.
    UnterminatedDoubleQuote {
        /// Offset of the opening quote, counted in characters from the start of the input.
        char_cursor: usize,
        /// Offset of the opening quote, counted in bytes from the start of the input.
        byte_cursor: usize,
    },
}

impl std::fmt::Display for UnquoteError {
    /// Formats the error as a one-line, user-facing message that names the kind of quote and
    /// both offsets of its opening character.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        // Both variants carry the same payload, so only the quote kind differs in the message.
        let (kind, char_cursor, byte_cursor) = match self {
            UnquoteError::UnterminatedSingleQuote { char_cursor, byte_cursor } => {
                ("single", char_cursor, byte_cursor)
            }
            UnquoteError::UnterminatedDoubleQuote { char_cursor, byte_cursor } => {
                ("double", char_cursor, byte_cursor)
            }
        };

        write!(
            f,
            "unterminated {}-quote sequence opened at character {} (byte {})",
            kind, char_cursor, byte_cursor
        )
    }
}

// The error carries no underlying source error, so the default trait methods suffice.
impl std::error::Error for UnquoteError {}
78
/// Quote string
///
/// Quotes `source` according to POSIX Shell rules, so that a POSIX compatible shell interprets
/// the result as a single token. The [`unquote()`] operation implements the inverse.
///
/// There is no canonical quoted form: a string can be quoted in infinitely many ways. This
/// implementation always emits sequences of single-quotes, as many other implementations do.
/// It may add redundant quotes even though a shorter output would be possible; this keeps the
/// output comparable with those implementations. Callers must never try to predict the exact
/// escaping and quoting done by this function.
///
/// # Examples
///
/// ```
/// assert_eq!(r_shquote::quote("foobar"), "'foobar'");
/// ```
pub fn quote(source: &str) -> String {
    // The output is deliberately verbose in some cases, for instance:
    //   `'` => `''\'''`
    //   `` => `''`
    // This keeps the implementation simple and in line with other implementations. A more
    // compact variant can be offered separately if it is ever needed.

    // Room for the input plus the surrounding quotes; embedded single-quotes grow it further.
    let mut quoted = String::with_capacity(source.len() + 2);

    quoted.push('\'');

    // A single-quote cannot be escaped inside a single-quote sequence. Hence, every
    // single-quote of the input closes the running sequence, emits an escaped quote, and opens
    // a new sequence before the next segment is copied.
    for (index, segment) in source.split('\'').enumerate() {
        if index > 0 {
            quoted.push_str("\'\\\'\'");
        }
        quoted.push_str(segment);
    }

    quoted.push('\'');
    quoted
}
124
fn unquote_open_single(acc: &mut String, cursor: &mut std::iter::Enumerate<std::str::CharIndices>) -> bool {
    // Decode the remainder of a single-quote sequence. The caller already consumed the opening
    // single-quote, so `cursor` yields the first character following it.
    //
    // Everything up to the next single-quote is appended to `acc` verbatim. No escape sequences
    // exist in this context, so not even a single-quote can be escaped. The closing quote is
    // consumed but produces no output.
    //
    // Returns `true` if the closing single-quote was found, and `false` if the input ended
    // first, in which case the sequence is unterminated and the operation is invalid.
    for (_, (_, ch)) in cursor {
        if ch == '\'' {
            return true;
        }

        acc.push(ch);
    }

    false
}
142
fn unquote_open_double(acc: &mut String, cursor: &mut std::iter::Enumerate<std::str::CharIndices>) -> bool {
    // Decode the remainder of a double-quote sequence. The caller already consumed the opening
    // double-quote, so `cursor` yields the first character following it.
    //
    // Returns `true` once the closing, unescaped double-quote was consumed. Returns `false` if
    // the input ends before that, including when it ends right after a backslash. The sequence
    // is then unterminated and the entire operation has to be refused.
    while let Some((_, (_, ch))) = cursor.next() {
        match ch {
            // An unescaped double-quote terminates the sequence and produces no output.
            '"' => return true,

            // A backslash starts an escape sequence, which needs one more character.
            '\\' => match cursor.next() {
                Some((_, (_, esc))) => {
                    // Known sequences produce only the escaped character. Any unknown sequence
                    // is copied verbatim in its entirety, including the backslash.
                    //
                    // NOTE(review): backslash followed by <NL> emits the newline here, while a
                    // POSIX shell treats that pair as a line continuation and drops both.
                    // Presumably kept for compatibility with other implementations; confirm.
                    match esc {
                        '"' | '\\' | '`' | '$' | '\n' => {}
                        _ => acc.push('\\'),
                    }

                    acc.push(esc);
                }
                // A dangling backslash implies the double-quote sequence is unterminated.
                None => return false,
            },

            // Any other character is copied literally, just like outside of quotes.
            _ => acc.push(ch),
        }
    }

    // The input ended without a closing double-quote.
    false
}
195
fn unquote_open_escape(acc: &mut String, cursor: &mut std::iter::Enumerate<std::str::CharIndices>) {
    // Decode an escape sequence outside of any quote. The caller already consumed the opening
    // backslash, so `cursor` yields the character following it.
    //
    // Outside of quotes, an escape sequence makes the next character literal: it is appended
    // to `acc` without being interpreted. There are two exceptions, both of which strip the
    // escape sequence and produce no output:
    //   - a literal <NL> (newline character) following the backslash,
    //   - a backslash that is the last character of the input.
    // The <NL> case is a remnant of interactive shell input, where a trailing backslash
    // continues a command on the next line. Both the backslash and the <NL> are ignored, since
    // they purely serve readability of user input.
    match cursor.next() {
        None | Some((_, (_, '\n'))) => {}
        Some((_, (_, ch))) => acc.push(ch),
    }
}
213
214/// Unquote String
215///
216/// Unquote a single string according to POSIX Shell quoting and escaping rules. If the input
217/// string is not a valid input, the operation will fail and provide diagnosis information on
218/// where the first invalid part was encountered.
219///
220/// The result is canonical. There is only one valid unquoted result for a given input.
221///
222/// # Examples
223///
224/// ```
225/// assert_eq!(r_shquote::unquote("foobar").unwrap(), "foobar");
226/// ```
227pub fn unquote(source: &str) -> Result<String, UnquoteError> {
228    // An unquote-operation never results in a longer string. Furthermore, the common case is
229    // most of the string is unquoted / unescaped. Hence, we simply allocate the same space
230    // for the resulting string as the input.
231    let mut acc = String::with_capacity(source.len());
232
233    // We loop over the string. When a single-quote, double-quote, or escape sequence is
234    // opened, we let out helpers parse the sub-strings. Anything else is copied over
235    // literally until the end of the line.
236    let mut cursor = source.char_indices().enumerate();
237    loop {
238        match cursor.next() {
239            Some((next_idx, (next_pos, next_ch))) if next_ch == '\'' => {
240                if !unquote_open_single(&mut acc, &mut cursor) {
241                    break Err(
242                        UnquoteError::UnterminatedSingleQuote {
243                            char_cursor: next_idx,
244                            byte_cursor: next_pos,
245                        }
246                    );
247                }
248            },
249            Some((next_idx, (next_pos, next_ch))) if next_ch == '"' => {
250                if !unquote_open_double(&mut acc, &mut cursor) {
251                    break Err(
252                        UnquoteError::UnterminatedDoubleQuote {
253                            char_cursor: next_idx,
254                            byte_cursor: next_pos,
255                        }
256                    );
257                }
258            },
259            Some((_, (_, next_ch))) if next_ch == '\\' => {
260                unquote_open_escape(&mut acc, &mut cursor);
261            },
262            Some((_, (_, next_ch))) => {
263                acc.push(next_ch);
264            },
265            None => {
266                break Ok(acc);
267            },
268        }
269    }
270}
271
#[cfg(test)]
mod tests {
    use super::*;

    // Smoke test of both public operations on a handful of fixed inputs.
    #[test]
    fn basic() {
        // Pairs of (raw input, expected quoted form).
        let quote_cases = [
            ("foobar", "'foobar'"),
            ("", "''"),
            ("'", "''\\'''"),
        ];
        for &(input, expected) in quote_cases.iter() {
            assert_eq!(quote(input), expected);
        }

        // Pairs of (quoted input, expected unquoted form). All of these inputs are valid.
        let unquote_cases = [
            ("foobar", "foobar"),
            ("foo'bar'", "foobar"),
            ("foo\"bar\"", "foobar"),
            ("\\foobar\\", "foobar"),
            ("\\'foobar\\'", "'foobar'"),
        ];
        for &(input, expected) in unquote_cases.iter() {
            assert_eq!(unquote(input).unwrap(), expected);
        }
    }
}