snailquote/lib.rs
1#[cfg(test)]
2extern crate quickcheck;
3#[cfg(test)]
4#[macro_use(quickcheck)]
5extern crate quickcheck_macros;
6
7extern crate unicode_categories;
8
9use std::borrow::Cow;
10use std::num::ParseIntError;
11use std::{char, str};
12use thiserror::Error;
13use unicode_categories::UnicodeCategories;
14
15/// Escape the provided string with shell-like quoting and escapes.
16/// Strings which do not need to be escaped will be returned unchanged.
17///
18/// # Details
19///
20/// Escape will prefer to avoid quoting when possible. When quotes are required, it will prefer
21/// single quotes (which have simpler semantics, namely no escaping). In all other cases it will
22/// use double quotes and escape whatever characters it needs to.
23///
24/// For the full list of escapes which will be used, see the table in
25/// [unescape](unescape).
26///
27/// # Examples
28/// ```
29/// use snailquote::escape;
30/// # // The println/assert duplication is because I want to show the output you'd get without
31/// # // rust's string quoting/escaping getting in the way
32/// # // Ideally we could just assert on stdout, not duplicate, see
33/// # // https://github.com/rust-lang/rfcs/issues/2270
34/// println!("{}", escape("foo")); // no escapes needed
35/// // foo
36/// # assert_eq!(escape("foo"), "foo");
37/// println!("{}", escape("String with spaces")); // single-quoteable
38/// // 'String with spaces'
39/// # assert_eq!(escape("String with spaces"), "'String with spaces'");
40/// println!("{}", escape("東方")); // no escapes needed
41/// // 東方
42/// # assert_eq!(escape("東方"), "東方");
43/// println!("{}", escape("\"new\nline\"")); // escape needed
44/// // "\"new\nline\""
45/// # assert_eq!(escape("\"new\nline\""), "\"\\\"new\\nline\\\"\"");
46/// ```
47// escape performs some minimal 'shell-like' escaping on a given string
48pub fn escape(s: &str) -> Cow<str> {
49 let mut needs_quoting = false;
50 let mut single_quotable = true;
51
52 for c in s.chars() {
53 let quote = match c {
54 // Special cases, can't be single quoted
55 '\'' | '\\' => {
56 single_quotable = false;
57 true
58 },
59 // ' ' is up here before c.is_whitespace() because it's the only whitespace we can
60 // single quote safely. Things like '\t' need to be escaped.
61 '"' | ' ' => true,
62 // Special characters in shells that can error out or expand if not quoted
63 '(' | ')' | '&' | '~' | '$' | '#' | '`' | ';' => true,
64 // sh globbing chars
65 '*' | '?' | '!' | '[' => true,
66 // redirects / pipes
67 '>' | '<' | '|' => true,
68 c if c.is_whitespace() || c.is_separator() || c.is_other() => {
69 // we need to escape most whitespace (i.e. \t), so we need double quotes.
70 single_quotable = false;
71 true
72 },
73 _ => false,
74 };
75 if quote {
76 needs_quoting = true;
77 }
78 if needs_quoting && !single_quotable {
79 // We know we'll need double quotes, no need to check further
80 break;
81 }
82 }
83
84 if !needs_quoting {
85 return Cow::from(s);
86 }
87 if single_quotable {
88 return format!("'{}'", s).into();
89 }
90 // otherwise we need to double quote it
91
92 let mut output = String::with_capacity(s.len());
93 output.push('"');
94
95 for c in s.chars() {
96 if c == '"' {
97 output += "\\\"";
98 } else if c == '\\' {
99 output += "\\\\";
100 } else if c == ' ' {
101 // avoid 'escape_unicode' for ' ' even though it's a separator
102 output.push(c);
103 } else if c == '$' {
104 output += "\\$";
105 } else if c == '`' {
106 output += "\\`";
107 } else if c.is_other() || c.is_separator() {
108 output += &escape_character(c);
109 } else {
110 output.push(c);
111 }
112 }
113
114 output.push('"');
115 output.into()
116}
117
// escape_character turns a single character that *must* be escaped (a control character, an
// unusual separator, ...) into its textual escape sequence.
// The short, human readable forms ('\a', '\b', '\v', '\f', '\e') are produced for the
// characters rust's "char::escape_default" has no short form for; every other character is
// delegated to "char::escape_default", which yields '\t', '\r', '\n' or a '\u{..}' escape.
fn escape_character(c: char) -> String {
    // Letter of the short escape, for the characters escape_default does not abbreviate.
    let short_form = match c {
        '\u{07}' => Some('a'),
        '\u{08}' => Some('b'),
        '\u{0b}' => Some('v'),
        '\u{0c}' => Some('f'),
        '\u{1b}' => Some('e'),
        _ => None,
    };

    match short_form {
        Some(letter) => {
            let mut escaped = String::with_capacity(2);
            escaped.push('\\');
            escaped.push(letter);
            escaped
        }
        // escape_default does the right thing for \t, \r, \n, and unicode
        None => c.escape_default().collect(),
    }
}
138
139/// Error type of [unescape](unescape).
140#[derive(Debug, Error, PartialEq)]
141pub enum UnescapeError {
142 #[error("invalid escape {escape} at {index} in {string}")]
143 InvalidEscape {
144 escape: String,
145 index: usize,
146 string: String,
147 },
148 #[error("\\u could not be parsed at {index} in {string}: {source}")]
149 InvalidUnicode {
150 #[source]
151 source: ParseUnicodeError,
152 index: usize,
153 string: String,
154 },
155}
156
157/// Source error type of [UnescapeError::InvalidUnicode](UnescapeError::InvalidUnicode).
158#[derive(Debug, Error, PartialEq)]
159pub enum ParseUnicodeError {
160 #[error("expected '{{' character in unicode escape")]
161 BraceNotFound,
162 #[error("could not parse {string} as u32 hex: {source}")]
163 ParseHexFailed {
164 #[source]
165 source: ParseIntError,
166 string: String,
167 },
168 #[error("could not parse {value} as a unicode char")]
169 ParseUnicodeFailed { value: u32 },
170}
171
172/// Parse the provided shell-like quoted string, such as one produced by [escape](escape).
173///
174/// # Details
175///
176/// Unescape is able to handle single quotes (which cannot contain any additional escapes), double
177/// quotes (which may contain a set of escapes similar to ANSI-C, i.e. '\n', '\r', '\'', etc.
178/// Unescape will also parse unicode escapes of the form "\u{01ff}". See
179/// [char::escape_unicode](std::char::EscapeUnicode) in the Rust standard library for more
180/// information on these escapes.
181///
182/// Multiple different quoting styles may be used in one string, for example, the following string
183/// is valid: `'some spaces'_some_unquoted_"and a \t tab"`.
184///
185/// The full set of supported escapes between double quotes may be found below:
186///
187/// | Escape | Unicode | Description |
188/// |--------|---------|-------------|
189/// | \a | \u{07} | Bell |
190/// | \b | \u{08} | Backspace |
191/// | \v | \u{0B} | Vertical tab |
192/// | \f | \u{0C} | Form feed |
193/// | \n | \u{0A} | Newline |
194/// | \r | \u{0D} | Carriage return |
195/// | \t | \u{09} | Tab
196/// | \e | \u{1B} | Escape |
197/// | \E | \u{1B} | Escape |
198/// | \\ | \u{5C} | Backslash |
199/// | \' | \u{27} | Single quote |
200/// | \" | \u{22} | Double quote |
201/// | \$ | \u{24} | Dollar sign (sh compatibility) |
202/// | \` | \u{60} | Backtick (sh compatibility) |
203/// | \u{XX} | \u{XX} | Unicode character with hex code XX |
204///
205/// # Errors
206///
207/// The returned result can display a human readable error if the string cannot be parsed as a
208/// valid quoted string.
209///
210/// # Examples
211/// ```
212/// use snailquote::unescape;
213/// # // The println/assert duplication is because I want to show the output you'd get without
214/// # // rust's string quoting/escaping getting in the way
215/// # // Ideally we could just assert on stdout, not duplicate, see
216/// # // https://github.com/rust-lang/rfcs/issues/2270
217/// println!("{}", unescape("foo").unwrap());
218/// // foo
219/// # assert_eq!(unescape("foo").unwrap(), "foo");
220/// println!("{}", unescape("'String with spaces'").unwrap());
221/// // String with spaces
222/// # assert_eq!(unescape("'String with spaces'").unwrap(), "String with spaces");
223/// println!("{}", unescape("\"new\\nline\"").unwrap());
224/// // new
225/// // line
226/// # assert_eq!(unescape("\"new\\nline\"").unwrap(), "new\nline");
227/// println!("{}", unescape("'some spaces'_some_unquoted_\"and a \\t tab\"").unwrap());
228/// // some spaces_some_unquoted_and a tab
229/// # assert_eq!(unescape("'some spaces'_some_unquoted_\"and a \\t tab\"").unwrap(), "some spaces_some_unquoted_and a \t tab");
230/// ```
231pub fn unescape(s: &str) -> Result<String, UnescapeError> {
232 let mut in_single_quote = false;
233 let mut in_double_quote = false;
234
235 let mut chars = s.chars().enumerate();
236
237 let mut res = String::with_capacity(s.len());
238
239 while let Some((idx, c)) = chars.next() {
240 // when in a single quote, no escapes are possible
241 if in_single_quote {
242 if c == '\'' {
243 in_single_quote = false;
244 continue;
245 }
246 } else if in_double_quote {
247 if c == '"' {
248 in_double_quote = false;
249 continue;
250 }
251
252 if c == '\\' {
253 match chars.next() {
254 None => {
255 return Err(UnescapeError::InvalidEscape {
256 escape: format!("{}", c),
257 index: idx,
258 string: String::from(s),
259 });
260 }
261 Some((idx, c2)) => {
262 res.push(match c2 {
263 'a' => '\u{07}',
264 'b' => '\u{08}',
265 'v' => '\u{0B}',
266 'f' => '\u{0C}',
267 'n' => '\n',
268 'r' => '\r',
269 't' => '\t',
270 'e' | 'E' => '\u{1B}',
271 '\\' => '\\',
272 '\'' => '\'',
273 '"' => '"',
274 '$' => '$',
275 '`' => '`',
276 ' ' => ' ',
277 'u' => parse_unicode(&mut chars).map_err(|x| {
278 UnescapeError::InvalidUnicode {
279 source: x,
280 index: idx,
281 string: String::from(s),
282 }
283 })?,
284 _ => {
285 return Err(UnescapeError::InvalidEscape {
286 escape: format!("{}{}", c, c2),
287 index: idx,
288 string: String::from(s),
289 });
290 }
291 });
292 continue;
293 }
294 };
295 }
296 } else if c == '\'' {
297 in_single_quote = true;
298 continue;
299 } else if c == '"' {
300 in_double_quote = true;
301 continue;
302 }
303
304 res.push(c);
305 }
306
307 Ok(res)
308}
309
310// parse_unicode takes an iterator over characters and attempts to extract a single unicode
311// character from it.
312// It parses escapes of the form '\u{65b9}', but this internal helper function expects the cursor
313// to be advanced to between the 'u' and '{'.
314// It also expects to be passed an iterator which includes the index for the purpose of advancing
315// it as well, such as is produced by enumerate.
316fn parse_unicode<I>(chars: &mut I) -> Result<char, ParseUnicodeError>
317where
318 I: Iterator<Item = (usize, char)>,
319{
320 match chars.next() {
321 Some((_, '{')) => {}
322 _ => {
323 return Err(ParseUnicodeError::BraceNotFound);
324 }
325 }
326
327 let unicode_seq: String = chars
328 .take_while(|&(_, c)| c != '}')
329 .map(|(_, c)| c)
330 .collect();
331
332 u32::from_str_radix(&unicode_seq, 16)
333 .map_err(|e| ParseUnicodeError::ParseHexFailed {
334 source: e,
335 string: unicode_seq,
336 })
337 .and_then(|u| {
338 char::from_u32(u).ok_or_else(|| ParseUnicodeError::ParseUnicodeFailed { value: u })
339 })
340}
341
// Unit, property and fixture-based tests for `escape` and `unescape`.
#[cfg(test)]
mod test {
    use super::*;
    #[cfg(feature = "unsafe_tests")]
    use std::process::Command;

    // Pins the exact output of `escape` for a table of representative inputs.
    #[test]
    fn test_escape() {
        let test_cases = vec![
            ("東方", "東方"),
            ("\"'", r#""\"'""#),
            ("\\", "\"\\\\\""),
            ("spaces only", "'spaces only'"),
            ("some\ttabs", "\"some\\ttabs\""),
            ("💩", "💩"),
            ("\u{202e}RTL", "\"\\u{202e}RTL\""),
            ("no\u{202b}space", "\"no\\u{202b}space\""),
            ("cash $ money $$ \t", "\"cash \\$ money \\$\\$ \\t\""),
            ("back ` tick `` \t", "\"back \\` tick \\`\\` \\t\""),
            (
                "\u{07}\u{08}\u{0b}\u{0c}\u{0a}\u{0d}\u{09}\u{1b}\u{1b}\u{5c}\u{27}\u{22}",
                "\"\\a\\b\\v\\f\\n\\r\\t\\e\\e\\\\'\\\"\"",
            ),
            ("semi;colon", "'semi;colon'"),
        ];

        for (s, expected) in test_cases {
            assert_eq!(escape(s), expected);
        }
    }

    // Pins the output of `unescape` for valid inputs.
    #[test]
    fn test_unescape() {
        assert_eq!(unescape("\"\\u{6771}\\u{65b9}\""), Ok("東方".to_string()));
        assert_eq!(unescape("東方"), Ok("東方".to_string()));
        assert_eq!(unescape("\"\\\\\"'\"\"'"), Ok("\\\"\"".to_string()));
        // A double quote inside single quotes, and the mirrored case.
        assert_eq!(unescape("'\"'"), Ok("\"".to_string()));
        assert_eq!(unescape("\"'\""), Ok("'".to_string()));
        // Every escape between double quotes
        assert_eq!(
            unescape("\"\\a\\b\\v\\f\\n\\r\\t\\e\\E\\\\\\'\\\"\\u{09}\\$\\`\""),
            Ok(
                "\u{07}\u{08}\u{0b}\u{0c}\u{0a}\u{0d}\u{09}\u{1b}\u{1b}\u{5c}\u{27}\u{22}\u{09}$`"
                    .to_string()
            )
        );
    }

    // Pins the error values (including the reported index) for invalid inputs.
    #[test]
    fn test_unescape_error() {
        assert_eq!(
            unescape("\"\\x\""),
            Err(UnescapeError::InvalidEscape {
                escape: "\\x".to_string(),
                index: 2,
                string: "\"\\x\"".to_string()
            })
        );
        // A backslash that ends the input is reported at the backslash itself.
        assert_eq!(
            unescape("\"\\"),
            Err(UnescapeError::InvalidEscape {
                escape: "\\".to_string(),
                index: 1,
                string: "\"\\".to_string()
            })
        );
        assert_eq!(
            unescape("\"\\u6771}\""),
            Err(UnescapeError::InvalidUnicode {
                source: ParseUnicodeError::BraceNotFound,
                index: 2,
                string: "\"\\u6771}\"".to_string()
            })
        );
        // Can't compare ParseIntError directly until 'int_error_matching' becomes stable
        assert_eq!(
            format!("{}", unescape("\"\\u{qqqq}\"").err().unwrap()),
            "\\u could not be parsed at 2 in \"\\u{qqqq}\": could not parse qqqq as u32 hex: invalid digit found in string"
        );
        assert_eq!(
            unescape("\"\\u{ffffffff}\""),
            Err(UnescapeError::InvalidUnicode {
                source: ParseUnicodeError::ParseUnicodeFailed { value: 0xffffffff },
                index: 2,
                string: "\"\\u{ffffffff}\"".to_string()
            })
        );
    }

    // `unescape(escape(x))` must give back `x` for a handful of tricky inputs.
    #[test]
    fn test_round_trip() {
        let test_cases = vec![
            "東方",
            "foo bar baz",
            "\\",
            "\0",
            "\"'",
            "\"'''''\"()())}{{}{}{{{!////",
            "foo;bar",
        ];

        for case in test_cases {
            assert_eq!(unescape(&escape(case)), Ok(case.to_owned()));
        }
    }

    // Property: escaping and then unescaping any string gives back the original.
    #[quickcheck]
    fn round_trips(s: String) -> bool {
        s == unescape(&escape(&s)).unwrap()
    }

    // Property: a real `sh` reads the escaped form back as the original string.
    // Only printable ASCII is exercised; control and non-ASCII characters are stripped first.
    // Gated behind a feature because it executes generated input through a shell.
    #[cfg(feature = "unsafe_tests")]
    #[quickcheck]
    fn sh_quoting_round_trips(s: String) -> bool {
        let s = s.replace(|c: char| c.is_ascii_control() || !c.is_ascii(), "");
        let escaped = escape(&s);
        println!("escaped '{}' as '{}'", s, escaped);
        let output = Command::new("sh")
            .arg("-c")
            .arg(format!("printf '%s' {}", escaped))
            .output()
            .unwrap();
        assert!(
            output.status.success(),
            "printf %s {} did not exit with success",
            escaped
        );
        let echo_output = String::from_utf8(output.stdout).unwrap();
        println!("printf gave it back as '{}'", echo_output);
        echo_output == s
    }

    // Every value in the bundled os-release fixtures must unescape, and PRETTY_NAME must match.
    #[test]
    fn test_os_release_parsing() {
        let tests = vec![
            ("fedora-19", "Fedora 19 (Schrödinger’s Cat)"),
            ("fedora-29", "Fedora 29 (Twenty Nine)"),
            ("gentoo", "Gentoo/Linux"),
            ("fictional", "Fictional $ OS: ` edition"),
        ];

        for (file, pretty_name) in tests {
            let path = format!("./src/testdata/os-releases/{}", file);
            let data = std::fs::read_to_string(&path)
                .unwrap_or_else(|e| panic!("could not read {}: {}", path, e));

            let mut found_prettyname = false;
            // partial os-release parser
            for line in data.lines() {
                // Blank lines and comments carry no key/value pair.
                let trimmed = line.trim();
                if trimmed.is_empty() || trimmed.starts_with('#') {
                    continue;
                }
                // Split the untrimmed line so the value reaches `unescape` exactly as written.
                let mut iter = line.splitn(2, '=');
                let key = iter
                    .next()
                    .expect("splitn always yields at least one item");
                let value = iter
                    .next()
                    .unwrap_or_else(|| panic!("{}: line without '=': {:?}", path, line));
                // assert we can parse the value
                let unescaped = unescape(value)
                    .unwrap_or_else(|e| panic!("{}: could not unescape {:?}: {}", path, value, e));
                if key == "PRETTY_NAME" {
                    assert_eq!(unescaped, pretty_name);
                    found_prettyname = true;
                }
            }
            assert!(
                found_prettyname,
                "expected os-release to have 'PRETTY_NAME' key"
            );
        }
    }
}