snailquote/
lib.rs

1#[cfg(test)]
2extern crate quickcheck;
3#[cfg(test)]
4#[macro_use(quickcheck)]
5extern crate quickcheck_macros;
6
7extern crate unicode_categories;
8
9use std::borrow::Cow;
10use std::num::ParseIntError;
11use std::{char, str};
12use thiserror::Error;
13use unicode_categories::UnicodeCategories;
14
15/// Escape the provided string with shell-like quoting and escapes.
16/// Strings which do not need to be escaped will be returned unchanged.
17///
18/// # Details
19///
20/// Escape will prefer to avoid quoting when possible. When quotes are required, it will prefer
21/// single quotes (which have simpler semantics, namely no escaping). In all other cases it will
22/// use double quotes and escape whatever characters it needs to.
23///
24/// For the full list of escapes which will be used, see the table in
25/// [unescape](unescape).
26///
27/// # Examples
28/// ```
29/// use snailquote::escape;
30/// # // The println/assert duplication is because I want to show the output you'd get without
31/// # // rust's string quoting/escaping getting in the way
32/// # // Ideally we could just assert on stdout, not duplicate, see
33/// # // https://github.com/rust-lang/rfcs/issues/2270
34/// println!("{}", escape("foo")); // no escapes needed
35/// // foo
36/// # assert_eq!(escape("foo"), "foo");
37/// println!("{}", escape("String with spaces")); // single-quoteable
38/// // 'String with spaces'
39/// # assert_eq!(escape("String with spaces"), "'String with spaces'");
40/// println!("{}", escape("東方")); // no escapes needed
41/// // 東方
42/// # assert_eq!(escape("東方"), "東方");
43/// println!("{}", escape("\"new\nline\"")); // escape needed
44/// // "\"new\nline\""
45/// # assert_eq!(escape("\"new\nline\""), "\"\\\"new\\nline\\\"\"");
46/// ```
47// escape performs some minimal 'shell-like' escaping on a given string
48pub fn escape(s: &str) -> Cow<str> {
49    let mut needs_quoting = false;
50    let mut single_quotable = true;
51
52    for c in s.chars() {
53        let quote = match c {
54            // Special cases, can't be single quoted
55            '\'' | '\\' => {
56                single_quotable = false;
57                true
58            },
59            // ' ' is up here before c.is_whitespace() because it's the only whitespace we can
60            // single quote safely. Things like '\t' need to be escaped.
61            '"' | ' ' => true,
62            // Special characters in shells that can error out or expand if not quoted
63            '(' | ')' | '&' | '~' | '$' | '#' | '`' | ';' => true,
64            // sh globbing chars
65            '*' | '?' | '!' | '[' => true,
66            // redirects / pipes
67            '>' | '<' | '|' => true,
68            c if c.is_whitespace() || c.is_separator() || c.is_other() => {
69                // we need to escape most whitespace (i.e. \t), so we need double quotes.
70                single_quotable = false;
71                true
72            },
73            _ => false,
74        };
75        if quote {
76            needs_quoting = true;
77        }
78        if needs_quoting && !single_quotable {
79            // We know we'll need double quotes, no need to check further
80            break;
81        }
82    }
83
84    if !needs_quoting {
85        return Cow::from(s);
86    }
87    if single_quotable {
88        return format!("'{}'", s).into();
89    }
90    // otherwise we need to double quote it
91
92    let mut output = String::with_capacity(s.len());
93    output.push('"');
94
95    for c in s.chars() {
96        if c == '"' {
97            output += "\\\"";
98        } else if c == '\\' {
99            output += "\\\\";
100        } else if c == ' ' {
101            // avoid 'escape_unicode' for ' ' even though it's a separator
102            output.push(c);
103        } else if c == '$' {
104            output += "\\$";
105        } else if c == '`' {
106            output += "\\`";
107        } else if c.is_other() || c.is_separator() {
108            output += &escape_character(c);
109        } else {
110            output.push(c);
111        }
112    }
113
114    output.push('"');
115    output.into()
116}
117
// escape_character is an internal helper which converts the given unicode character into an
// escape sequence. It is assumed the character passed in *must* be escaped (e.g. it is some
// non-printable or unusual character).
// escape_character will prefer more human readable escapes (e.g. '\n' over '\u{0a}'), but will
// fall back on dumb unicode escaping.
// It is similar to rust's "char::escape_default", but supports additional escapes that rust does
// not ('\a', '\b', '\v', '\f' and '\e'). Every other character is handed to 'escape_default'
// unchanged.
fn escape_character(c: char) -> String {
    let short_escape = match c {
        '\u{07}' => "\\a",
        '\u{08}' => "\\b",
        '\u{0b}' => "\\v",
        '\u{0c}' => "\\f",
        '\u{1b}' => "\\e",
        // escape_default does the right thing for \t, \r, \n, and unicode
        _ => return c.escape_default().to_string(),
    };
    short_escape.to_owned()
}
138
139/// Error type of [unescape](unescape).
140#[derive(Debug, Error, PartialEq)]
141pub enum UnescapeError {
142    #[error("invalid escape {escape} at {index} in {string}")]
143    InvalidEscape {
144        escape: String,
145        index: usize,
146        string: String,
147    },
148    #[error("\\u could not be parsed at {index} in {string}: {source}")]
149    InvalidUnicode {
150        #[source]
151        source: ParseUnicodeError,
152        index: usize,
153        string: String,
154    },
155}
156
157/// Source error type of [UnescapeError::InvalidUnicode](UnescapeError::InvalidUnicode).
158#[derive(Debug, Error, PartialEq)]
159pub enum ParseUnicodeError {
160    #[error("expected '{{' character in unicode escape")]
161    BraceNotFound,
162    #[error("could not parse {string} as u32 hex: {source}")]
163    ParseHexFailed {
164        #[source]
165        source: ParseIntError,
166        string: String,
167    },
168    #[error("could not parse {value} as a unicode char")]
169    ParseUnicodeFailed { value: u32 },
170}
171
172/// Parse the provided shell-like quoted string, such as one produced by [escape](escape).
173///
174/// # Details
175///
176/// Unescape is able to handle single quotes (which cannot contain any additional escapes), double
177/// quotes (which may contain a set of escapes similar to ANSI-C, i.e. '\n', '\r', '\'', etc.
178/// Unescape will also parse unicode escapes of the form "\u{01ff}". See
179/// [char::escape_unicode](std::char::EscapeUnicode) in the Rust standard library for more
180/// information on these escapes.
181///
182/// Multiple different quoting styles may be used in one string, for example, the following string
183/// is valid: `'some spaces'_some_unquoted_"and a \t tab"`.
184///
185/// The full set of supported escapes between double quotes may be found below:
186///
187/// | Escape | Unicode | Description |
188/// |--------|---------|-------------|
189/// | \a     | \u{07}  | Bell        |
190/// | \b     | \u{08}  | Backspace   |
191/// | \v     | \u{0B}  | Vertical tab |
192/// | \f     | \u{0C}  | Form feed |
193/// | \n     | \u{0A}  | Newline |
194/// | \r     | \u{0D}  | Carriage return |
195/// | \t     | \u{09}  | Tab
196/// | \e     | \u{1B}  | Escape |
197/// | \E     | \u{1B}  | Escape |
198/// | \\     | \u{5C}  | Backslash |
199/// | \'     | \u{27}  | Single quote |
200/// | \"     | \u{22}  | Double quote |
201/// | \$     | \u{24}  | Dollar sign (sh compatibility) |
202/// | \`     | \u{60}  | Backtick (sh compatibility) |
203/// | \u{XX} | \u{XX}  | Unicode character with hex code XX |
204///
205/// # Errors
206///
207/// The returned result can display a human readable error if the string cannot be parsed as a
208/// valid quoted string.
209///
210/// # Examples
211/// ```
212/// use snailquote::unescape;
213/// # // The println/assert duplication is because I want to show the output you'd get without
214/// # // rust's string quoting/escaping getting in the way
215/// # // Ideally we could just assert on stdout, not duplicate, see
216/// # // https://github.com/rust-lang/rfcs/issues/2270
217/// println!("{}", unescape("foo").unwrap());
218/// // foo
219/// # assert_eq!(unescape("foo").unwrap(), "foo");
220/// println!("{}", unescape("'String with spaces'").unwrap());
221/// // String with spaces
222/// # assert_eq!(unescape("'String with spaces'").unwrap(), "String with spaces");
223/// println!("{}", unescape("\"new\\nline\"").unwrap());
224/// // new
225/// // line
226/// # assert_eq!(unescape("\"new\\nline\"").unwrap(), "new\nline");
227/// println!("{}", unescape("'some spaces'_some_unquoted_\"and a \\t tab\"").unwrap());
228/// // some spaces_some_unquoted_and a 	 tab
229/// # assert_eq!(unescape("'some spaces'_some_unquoted_\"and a \\t tab\"").unwrap(), "some spaces_some_unquoted_and a \t tab");
230/// ```
231pub fn unescape(s: &str) -> Result<String, UnescapeError> {
232    let mut in_single_quote = false;
233    let mut in_double_quote = false;
234
235    let mut chars = s.chars().enumerate();
236
237    let mut res = String::with_capacity(s.len());
238
239    while let Some((idx, c)) = chars.next() {
240        // when in a single quote, no escapes are possible
241        if in_single_quote {
242            if c == '\'' {
243                in_single_quote = false;
244                continue;
245            }
246        } else if in_double_quote {
247            if c == '"' {
248                in_double_quote = false;
249                continue;
250            }
251
252            if c == '\\' {
253                match chars.next() {
254                    None => {
255                        return Err(UnescapeError::InvalidEscape {
256                            escape: format!("{}", c),
257                            index: idx,
258                            string: String::from(s),
259                        });
260                    }
261                    Some((idx, c2)) => {
262                        res.push(match c2 {
263                            'a' => '\u{07}',
264                            'b' => '\u{08}',
265                            'v' => '\u{0B}',
266                            'f' => '\u{0C}',
267                            'n' => '\n',
268                            'r' => '\r',
269                            't' => '\t',
270                            'e' | 'E' => '\u{1B}',
271                            '\\' => '\\',
272                            '\'' => '\'',
273                            '"' => '"',
274                            '$' => '$',
275                            '`' => '`',
276                            ' ' => ' ',
277                            'u' => parse_unicode(&mut chars).map_err(|x| {
278                                UnescapeError::InvalidUnicode {
279                                    source: x,
280                                    index: idx,
281                                    string: String::from(s),
282                                }
283                            })?,
284                            _ => {
285                                return Err(UnescapeError::InvalidEscape {
286                                    escape: format!("{}{}", c, c2),
287                                    index: idx,
288                                    string: String::from(s),
289                                });
290                            }
291                        });
292                        continue;
293                    }
294                };
295            }
296        } else if c == '\'' {
297            in_single_quote = true;
298            continue;
299        } else if c == '"' {
300            in_double_quote = true;
301            continue;
302        }
303
304        res.push(c);
305    }
306
307    Ok(res)
308}
309
310// parse_unicode takes an iterator over characters and attempts to extract a single unicode
311// character from it.
312// It parses escapes of the form '\u{65b9}', but this internal helper function expects the cursor
313// to be advanced to between the 'u' and '{'.
314// It also expects to be passed an iterator which includes the index for the purpose of advancing
315// it  as well, such as is produced by enumerate.
316fn parse_unicode<I>(chars: &mut I) -> Result<char, ParseUnicodeError>
317where
318    I: Iterator<Item = (usize, char)>,
319{
320    match chars.next() {
321        Some((_, '{')) => {}
322        _ => {
323            return Err(ParseUnicodeError::BraceNotFound);
324        }
325    }
326
327    let unicode_seq: String = chars
328        .take_while(|&(_, c)| c != '}')
329        .map(|(_, c)| c)
330        .collect();
331
332    u32::from_str_radix(&unicode_seq, 16)
333        .map_err(|e| ParseUnicodeError::ParseHexFailed {
334            source: e,
335            string: unicode_seq,
336        })
337        .and_then(|u| {
338            char::from_u32(u).ok_or_else(|| ParseUnicodeError::ParseUnicodeFailed { value: u })
339        })
340}
341
#[cfg(test)]
mod test {
    use super::*;
    #[cfg(feature = "unsafe_tests")]
    use std::process::Command;

    // Table-driven check of `escape`: (input, expected escaped form).
    #[test]
    fn test_escape() {
        let test_cases = vec![
            ("東方", "東方"),
            ("\"'", r#""\"'""#),
            ("\\", "\"\\\\\""),
            ("spaces only", "'spaces only'"),
            ("some\ttabs", "\"some\\ttabs\""),
            ("💩", "💩"),
            ("\u{202e}RTL", "\"\\u{202e}RTL\""),
            ("no\u{202b}space", "\"no\\u{202b}space\""),
            ("cash $ money $$ \t", "\"cash \\$ money \\$\\$ \\t\""),
            ("back ` tick `` \t", "\"back \\` tick \\`\\` \\t\""),
            (
                "\u{07}\u{08}\u{0b}\u{0c}\u{0a}\u{0d}\u{09}\u{1b}\u{1b}\u{5c}\u{27}\u{22}",
                "\"\\a\\b\\v\\f\\n\\r\\t\\e\\e\\\\'\\\"\"",
            ),
            ("semi;colon", "'semi;colon'"),
        ];

        for (s, expected) in test_cases {
            assert_eq!(escape(s), expected);
        }
    }

    #[test]
    fn test_unescape() {
        assert_eq!(unescape("\"\\u{6771}\\u{65b9}\""), Ok("東方".to_string()));
        assert_eq!(unescape("東方"), Ok("東方".to_string()));
        assert_eq!(unescape("\"\\\\\"'\"\"'"), Ok("\\\"\"".to_string()));
        // A double quote is literal inside single quotes...
        assert_eq!(unescape("'\"'"), Ok("\"".to_string()));
        // ...and a single quote is literal inside double quotes.
        assert_eq!(unescape("\"'\""), Ok("'".to_string()));
        // Every escape between double quotes
        assert_eq!(
            unescape("\"\\a\\b\\v\\f\\n\\r\\t\\e\\E\\\\\\'\\\"\\u{09}\\$\\`\""),
            Ok(
                "\u{07}\u{08}\u{0b}\u{0c}\u{0a}\u{0d}\u{09}\u{1b}\u{1b}\u{5c}\u{27}\u{22}\u{09}$`"
                    .to_string()
            )
        );
    }

    // The reported index is the character position of the letter following the backslash.
    #[test]
    fn test_unescape_error() {
        assert_eq!(
            unescape("\"\\x\""),
            Err(UnescapeError::InvalidEscape {
                escape: "\\x".to_string(),
                index: 2,
                string: "\"\\x\"".to_string()
            })
        );
        assert_eq!(
            unescape("\"\\u6771}\""),
            Err(UnescapeError::InvalidUnicode {
                source: ParseUnicodeError::BraceNotFound,
                index: 2,
                string: "\"\\u6771}\"".to_string()
            })
        );
        // Can't compare ParseIntError directly until 'int_error_matching' becomes stable
        assert_eq!(
            format!("{}", unescape("\"\\u{qqqq}\"").err().unwrap()),
            "\\u could not be parsed at 2 in \"\\u{qqqq}\": could not parse qqqq as u32 hex: invalid digit found in string"
        );
        assert_eq!(
            unescape("\"\\u{ffffffff}\""),
            Err(UnescapeError::InvalidUnicode {
                source: ParseUnicodeError::ParseUnicodeFailed { value: 0xffffffff },
                index: 2,
                string: "\"\\u{ffffffff}\"".to_string()
            })
        );
    }

    // Hand-picked inputs for which unescape(escape(x)) must give x back.
    #[test]
    fn test_round_trip() {
        let test_cases = vec![
            "東方",
            "foo bar baz",
            "\\",
            "\0",
            "\"'",
            "\"'''''\"()())}{{}{}{{{!////",
            "foo;bar",
        ];

        for case in test_cases {
            assert_eq!(unescape(&escape(case)), Ok(case.to_owned()));
        }
    }

    // Property test: the same round trip must hold for arbitrary strings.
    #[quickcheck]
    fn round_trips(s: String) -> bool {
        s == unescape(&escape(&s)).unwrap()
    }

    // Property test against a real `sh`: the escaped form, pasted into a shell command line,
    // must be seen by printf as the original string. Only enabled with the "unsafe_tests"
    // feature since it executes generated input in a shell.
    #[cfg(feature = "unsafe_tests")]
    #[quickcheck]
    fn sh_quoting_round_trips(s: String) -> bool {
        // Restrict the input to printable ASCII; control and non-ASCII characters are dropped.
        let s = s.replace(|c: char| c.is_ascii_control() || !c.is_ascii(), "");
        let escaped = escape(&s);
        println!("escaped '{}' as '{}'", s, escaped);
        let output = Command::new("sh")
            .arg("-c")
            .arg(format!("printf '%s' {}", escaped))
            .output()
            .unwrap();
        if !output.status.success() {
            panic!("printf %s {} did not exit with success", escaped);
        }
        let echo_output = String::from_utf8(output.stdout).unwrap();
        println!("printf gave it back as '{}'", echo_output);
        echo_output == s
    }

    // Every value in the sample os-release files must unescape cleanly, and PRETTY_NAME must
    // come out as the expected human-readable name.
    #[test]
    fn test_os_release_parsing() {
        let tests = vec![
            ("fedora-19", "Fedora 19 (Schrödinger’s Cat)"),
            ("fedora-29", "Fedora 29 (Twenty Nine)"),
            ("gentoo", "Gentoo/Linux"),
            ("fictional", "Fictional $ OS: ` edition"),
        ];

        for (file, pretty_name) in tests {
            let path = format!("./src/testdata/os-releases/{}", file);
            let data = std::fs::read_to_string(&path)
                .unwrap_or_else(|e| panic!("could not read {}: {}", path, e));

            let mut found_prettyname = false;
            // partial os-release parser
            for line in data.lines() {
                // Blank lines and comments carry no KEY=VALUE pair.
                let trimmed = line.trim();
                if trimmed.is_empty() || trimmed.starts_with('#') {
                    continue;
                }
                let mut iter = line.splitn(2, '=');
                let key = iter.next().unwrap();
                let value = iter
                    .next()
                    .unwrap_or_else(|| panic!("expected KEY=VALUE in {}, got {:?}", path, line));
                // assert we can parse the value
                let unescaped = unescape(value).unwrap();
                if key == "PRETTY_NAME" {
                    assert_eq!(unescaped, pretty_name);
                    found_prettyname = true;
                }
            }
            assert!(
                found_prettyname,
                "expected os-release to have 'PRETTY_NAME' key"
            );
        }
    }
}