r_shquote/lib.rs
1//! POSIX Shell Compatible Argument Parser
2//!
//! This crate implements POSIX Shell compatible `quote` and `unquote` operations. Quoting turns an
//! arbitrary string into a form that a shell does not interpret when it is taken as input.
//! Unquoting is the inverse and yields the original input again.
6//!
7//! The way this quoting works is mostly standardized by POSIX. However, many existing
8//! implementations support additional features. These are explicitly not supported by this crate,
9//! and it is not the intention of this crate to support these quirks and peculiarities.
10//!
11//! The basic operations provided are [`quote()`] and [`unquote()`], which both take a UTF-8
12//! string as input, and produce the respective output string.
13//!
14//! # Examples
15//!
16//! ```
17//! let str = "Hello World!";
18//!
19//! println!("Quoted input: {}", r_shquote::quote(str));
20//! ```
21//!
22//! Unquote operations can fail when the input is not well defined. The returned error contains
23//! diagnostics to identify where exactly the parser failed:
24//!
25//! ```
26//! let quote = "'foobar";
27//! let res = r_shquote::unquote(quote).unwrap_err();
28//!
29//! println!("Unquote operation failed: {}", res);
30//! ```
31//!
32//! Combining the quote and unquote operation always produces the original input:
33//!
34//! ```
35//! let str = "foo bar";
36//!
37//! assert_eq!(str, r_shquote::unquote(&r_shquote::quote(str)).unwrap());
38//! ```
39
/// Error information for unquote operations
///
/// This error contains diagnostics from an unquote-operation. In particular, it contains the
/// character and byte offsets of the cursor where the error originated.
///
/// Errors implement `PartialEq`/`Eq`, so a specific failure can be asserted with `==`.
///
/// # Examples
///
/// ```
/// let quote = "'Hello' 'World!";
/// let res = r_shquote::unquote(quote).unwrap_err();
///
/// match res {
///     r_shquote::UnquoteError::UnterminatedSingleQuote { char_cursor: x, .. } |
///     r_shquote::UnquoteError::UnterminatedDoubleQuote { char_cursor: x, .. } => {
///         println!("Input: {}", quote);
///         println!("       {}^--- unterminated quote", " ".repeat(x));
///     },
/// }
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum UnquoteError {
    /// A single-quote sequence was opened but never closed.
    UnterminatedSingleQuote {
        /// Position of the opening quote, counted in characters from the start of the input.
        char_cursor: usize,
        /// Position of the opening quote, counted in bytes from the start of the input.
        byte_cursor: usize,
    },
    /// A double-quote sequence was opened but never closed.
    UnterminatedDoubleQuote {
        /// Position of the opening quote, counted in characters from the start of the input.
        char_cursor: usize,
        /// Position of the opening quote, counted in bytes from the start of the input.
        byte_cursor: usize,
    },
}
70
71impl std::fmt::Display for UnquoteError {
72 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
73 write!(f, "{:?}", self)
74 }
75}
76
77impl std::error::Error for UnquoteError { }
78
/// Quote string
///
/// Quotes `source` according to POSIX Shell rules. A POSIX compatible shell interprets the
/// returned string as one single token. [`unquote()`] is the inverse operation.
///
/// There is no canonical quoted form of a string; infinitely many quotings yield the same token.
/// This implementation always emits sequences of single-quotes, which mimics what many other
/// implementations do. Redundant quotes may be added even though a shorter output would be
/// possible. This keeps the result comparable with other existing implementations. Callers must
/// never try to predict the exact escaping and quoting done by this function.
///
/// # Examples
///
/// ```
/// assert_eq!(r_shquote::quote("foobar"), "'foobar'");
/// ```
pub fn quote(source: &str) -> String {
    // The output is deliberately verbose, for instance:
    //     `'` => `''\'''`
    //     ``  => `''`
    // This keeps the behavior in line with other implementations and the code simple. An
    // optimized variant can be provided separately if it is ever requested.

    // Room for the input plus the surrounding pair of quotes; grows if `source` has quotes.
    let mut quoted = String::with_capacity(source.len() + 2);

    quoted.push('\'');

    // Single-quotes cannot be escaped inside a single-quote sequence. Hence, every embedded
    // single-quote closes the running sequence, emits an escaped quote, and re-opens a sequence.
    for (idx, piece) in source.split('\'').enumerate() {
        if idx > 0 {
            quoted.push_str("'\\''");
        }
        quoted.push_str(piece);
    }

    quoted.push('\'');
    quoted
}
124
fn unquote_open_single(
    acc: &mut String,
    cursor: &mut std::iter::Enumerate<std::str::CharIndices>,
) -> bool {
    // Decode the body of a single-quote sequence. The caller already consumed the opening
    // single-quote, so `cursor` yields the first character following it.
    //
    // Everything up to the next single-quote is appended verbatim to `acc`. There are no escape
    // sequences in here; not even a single-quote can be escaped. The closing quote is consumed
    // and produces no output.
    //
    // Returns `true` if the closing quote was found, and `false` if the input ended first, in
    // which case the caller has to treat the entire operation as invalid.
    for (_, (_, ch)) in cursor.by_ref() {
        if ch == '\'' {
            return true;
        } else {
            acc.push(ch);
        }
    }

    false
}
142
fn unquote_open_double(
    acc: &mut String,
    cursor: &mut std::iter::Enumerate<std::str::CharIndices>,
) -> bool {
    // Decode the body of a double-quote sequence. The caller already consumed the opening
    // double-quote, so `cursor` yields the first character following it.
    //
    // The decoded content is appended to `acc`. Returns `true` once the closing double-quote
    // was consumed, and `false` if the input ended before the sequence (or an escape sequence
    // inside of it) was terminated. In the latter case the caller has to treat the entire
    // operation as invalid.
    loop {
        match cursor.next() {
            // An unescaped double-quote terminates the sequence. It produces no output.
            Some((_, (_, '"'))) => return true,

            // Inside a double-quote sequence the backslash is only special in front of a
            // small set of characters.
            Some((_, (_, '\\'))) => match cursor.next() {
                // Backslash followed by <NL> is a line continuation. POSIX removes both
                // characters, inside double-quotes just like outside of any quote.
                Some((_, (_, '\n'))) => {},

                // Known escapes produce the escaped character without the backslash.
                Some((_, (_, '"'))) => acc.push('"'),
                Some((_, (_, '\\'))) => acc.push('\\'),
                Some((_, (_, '`'))) => acc.push('`'),
                Some((_, (_, '$'))) => acc.push('$'),

                // Unknown escapes are copied verbatim, including the backslash.
                Some((_, (_, other))) => {
                    acc.push('\\');
                    acc.push(other);
                },

                // A dangling backslash implies that the double-quote sequence itself is
                // unterminated.
                None => return false,
            },

            // Any other character is copied literally, just like outside of quotes.
            Some((_, (_, ch))) => acc.push(ch),

            // The sequence was never terminated; refuse to produce a result.
            None => return false,
        }
    }
}
195
fn unquote_open_escape(
    acc: &mut String,
    cursor: &mut std::iter::Enumerate<std::str::CharIndices>,
) {
    // Decode an escape sequence outside of any quote. The caller already consumed the opening
    // backslash, so `cursor` yields the first character following it.
    //
    // Outside of quotes, a backslash makes the next character literal: it is appended to `acc`
    // without being interpreted. There are two exceptions, which both produce no output at all:
    //
    //  * A backslash followed by a literal <NL> (newline character). This is a remnant of
    //    interactive shell input, where a trailing backslash continues the command on the next
    //    line. Both characters are ignored, since they purely serve readability.
    //
    //  * A backslash as the very last character of the input.
    match cursor.next() {
        Some((_, (_, '\n'))) | None => {},
        Some((_, (_, literal))) => acc.push(literal),
    }
}
213
214/// Unquote String
215///
216/// Unquote a single string according to POSIX Shell quoting and escaping rules. If the input
217/// string is not a valid input, the operation will fail and provide diagnosis information on
218/// where the first invalid part was encountered.
219///
220/// The result is canonical. There is only one valid unquoted result for a given input.
221///
222/// # Examples
223///
224/// ```
225/// assert_eq!(r_shquote::unquote("foobar").unwrap(), "foobar");
226/// ```
227pub fn unquote(source: &str) -> Result<String, UnquoteError> {
228 // An unquote-operation never results in a longer string. Furthermore, the common case is
229 // most of the string is unquoted / unescaped. Hence, we simply allocate the same space
230 // for the resulting string as the input.
231 let mut acc = String::with_capacity(source.len());
232
233 // We loop over the string. When a single-quote, double-quote, or escape sequence is
234 // opened, we let out helpers parse the sub-strings. Anything else is copied over
235 // literally until the end of the line.
236 let mut cursor = source.char_indices().enumerate();
237 loop {
238 match cursor.next() {
239 Some((next_idx, (next_pos, next_ch))) if next_ch == '\'' => {
240 if !unquote_open_single(&mut acc, &mut cursor) {
241 break Err(
242 UnquoteError::UnterminatedSingleQuote {
243 char_cursor: next_idx,
244 byte_cursor: next_pos,
245 }
246 );
247 }
248 },
249 Some((next_idx, (next_pos, next_ch))) if next_ch == '"' => {
250 if !unquote_open_double(&mut acc, &mut cursor) {
251 break Err(
252 UnquoteError::UnterminatedDoubleQuote {
253 char_cursor: next_idx,
254 byte_cursor: next_pos,
255 }
256 );
257 }
258 },
259 Some((_, (_, next_ch))) if next_ch == '\\' => {
260 unquote_open_escape(&mut acc, &mut cursor);
261 },
262 Some((_, (_, next_ch))) => {
263 acc.push(next_ch);
264 },
265 None => {
266 break Ok(acc);
267 },
268 }
269 }
270}
271
#[cfg(test)]
mod tests {
    use super::*;

    // Smoke test of both public operations on a handful of fixed inputs.
    #[test]
    fn basic() {
        // (input, expected quoted output)
        let quote_cases = [
            ("foobar", "'foobar'"),
            ("", "''"),
            ("'", "''\\'''"),
        ];
        for &(input, expected) in quote_cases.iter() {
            assert_eq!(quote(input), expected);
        }

        // (input, expected unquoted output)
        let unquote_cases = [
            ("foobar", "foobar"),
            ("foo'bar'", "foobar"),
            ("foo\"bar\"", "foobar"),
            ("\\foobar\\", "foobar"),
            ("\\'foobar\\'", "'foobar'"),
        ];
        for &(input, expected) in unquote_cases.iter() {
            assert_eq!(unquote(input).unwrap(), expected);
        }
    }
}