use std::convert::TryFrom;
use super::errors::{Error, Result};
// Bytes emitted by `unescape` for the single-character escapes
// `\b`, `\f`, `\n`, `\r` and `\t`.
const BACKSPACE: u8 = 0x08;
const FORMFEED: u8 = 0x0C;
const LINEFEED: u8 = 0x0A;
const CARRIAGE_RETURN: u8 = 0x0D;
const HORIZONTAL_TAB: u8 = 0x09;

// Boundaries of the UTF-16 surrogate range as used by `unescape`:
// high surrogates span FIRST..=HIGH_LAST, low surrogates span LOW_FIRST..=LAST.
// NOTE: "SURROGARES" is a misspelling of "SURROGATES"; the names are kept as-is
// because other code in this file refers to them.
const SURROGARES_FIRST: u16 = 0xD800;
const SURROGARES_HIGH_LAST: u16 = 0xDBFF;
const SURROGARES_LOW_FIRST: u16 = 0xDC00;
const SURROGARES_LAST: u16 = 0xDFFF;
pub(crate) fn unescape(source: &[u8]) -> Result<String> {
let mut out: Vec<u8> = Vec::with_capacity(source.len());
let mut encoding_tmp = [0u8; 4];
let mut in_escape = false;
let mut in_unicode = false;
let mut unicode_tmp = [0u8; 4];
let mut unicode_tmp_pos: usize = 0;
let mut high_surrogate: Option<u16> = None;
for byte in source {
if *byte <= 0x1F {
return Err(Error::ControlCharacterInString);
}
if in_unicode {
match byte {
b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F' => {
unicode_tmp[unicode_tmp_pos] = *byte;
unicode_tmp_pos += 1;
if unicode_tmp_pos == 4 {
let codepoint = hex_decode(unicode_tmp);
if codepoint >= SURROGARES_FIRST && codepoint <= SURROGARES_LAST {
if let Some(high) = high_surrogate {
if codepoint < SURROGARES_LOW_FIRST {
return Err(Error::ExpectedLowSurrogate);
}
let low = codepoint;
let combined = 0x1_0000
+ (((high - 0xD800) as u32) << 10 | (low - 0xDC00) as u32);
let encoded = match char::try_from(combined) {
Ok(c) => c.encode_utf8(&mut encoding_tmp as &mut [u8]),
Err(_) => return Err(Error::InvalidUnicodeCodePoint),
};
out.extend_from_slice(encoded.as_bytes());
high_surrogate = None;
} else {
if codepoint > SURROGARES_HIGH_LAST {
return Err(Error::ExpectedHighSurrogate);
}
high_surrogate = Some(codepoint);
}
} else {
let encoded = match char::try_from(codepoint as u32) {
Ok(c) => c.encode_utf8(&mut encoding_tmp as &mut [u8]),
Err(_) => return Err(Error::InvalidEscape),
};
out.extend_from_slice(encoded.as_bytes());
}
unicode_tmp_pos = 0;
in_unicode = false;
in_escape = false;
}
}
_ => return Err(Error::InvalidEscape),
}
} else if in_escape {
match byte {
b'"' | b'/' | b'\\' => {
out.push(*byte);
in_escape = false;
}
b'b' => {
out.push(BACKSPACE);
in_escape = false;
}
b'f' => {
out.push(FORMFEED);
in_escape = false;
}
b'n' => {
out.push(LINEFEED);
in_escape = false;
}
b'r' => {
out.push(CARRIAGE_RETURN);
in_escape = false;
}
b't' => {
out.push(HORIZONTAL_TAB);
in_escape = false;
}
b'u' => {
in_unicode = true;
}
_ => return Err(Error::InvalidEscape),
}
} else {
if *byte == b'\\' {
in_escape = true;
} else {
if high_surrogate.is_some() {
return Err(Error::LoneSurrogateFound);
}
out.push(*byte);
}
}
}
if in_escape {
return Err(Error::InvalidEscape);
}
if high_surrogate.is_some() {
return Err(Error::LoneSurrogateFound);
}
String::from_utf8(out).map_err(|_| Error::InvalidUnicodeCodePoint)
}
fn hex_decode(a: [u8; 4]) -> u16 {
(hex_decode_4bit(a[0]) as u16) << 12
| (hex_decode_4bit(a[1]) as u16) << 8
| (hex_decode_4bit(a[2]) as u16) << 4
| (hex_decode_4bit(a[3]) as u16)
}
/// Converts one ASCII hex digit (`0-9`, `a-f`, `A-F`) into its value `0..=15`.
///
/// # Panics
///
/// Panics if `x` is not an ASCII hex digit.
fn hex_decode_4bit(x: u8) -> u8 {
    match x {
        b'0'..=b'9' => x - b'0',
        b'a'..=b'f' => 10 + (x - b'a'),
        b'A'..=b'F' => 10 + (x - b'A'),
        _ => panic!("Non-hex ASCII character found"),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Unescapes `source`, panicking if unescaping fails.
    fn ue(source: &[u8]) -> String {
        unescape(source).unwrap()
    }

    /// Unescapes `source`, panicking if unescaping succeeds; returns the error.
    fn uee(source: &[u8]) -> Error {
        unescape(source).unwrap_err()
    }

    #[test]
    fn unescape_works() {
        // Input without escapes is passed through unchanged
        assert_eq!(ue(b""), "");
        assert_eq!(ue(b"a"), "a");
        assert_eq!(ue(b"ab"), "ab");
        assert_eq!(ue(b"abc"), "abc");
        assert_eq!(ue(b"a/c"), "a/c");
        assert_eq!(ue(b"\x20"), " ");
        // Raw multi-byte UTF-8 (U+1F44F) is passed through unchanged
        assert_eq!(ue(b"\xF0\x9F\x91\x8F"), "\u{1F44F}");

        // Even numbers of backslashes
        assert_eq!(ue(br#"\\"#), "\\");
        assert_eq!(ue(br#"\\\\"#), "\\\\");
        assert_eq!(ue(br#"\\\\\\"#), "\\\\\\");
        assert_eq!(ue(br#"\\\\\\\\"#), "\\\\\\\\");

        // Each simple escape on its own
        assert_eq!(ue(br#"\""#), "\"");
        assert_eq!(ue(br#"\\"#), "\\");
        assert_eq!(ue(br#"\/"#), "/");
        assert_eq!(ue(br#"\b"#), "\x08");
        assert_eq!(ue(br#"\f"#), "\x0C");
        assert_eq!(ue(br#"\n"#), "\n");
        assert_eq!(ue(br#"\r"#), "\r");
        assert_eq!(ue(br#"\t"#), "\t");

        // Escape at the beginning
        assert_eq!(ue(br#"\"abc"#), "\"abc");
        assert_eq!(ue(br#"\\abc"#), "\\abc");
        assert_eq!(ue(br#"\/abc"#), "/abc");
        assert_eq!(ue(br#"\babc"#), "\x08abc");
        assert_eq!(ue(br#"\fabc"#), "\x0Cabc");
        assert_eq!(ue(br#"\nabc"#), "\nabc");
        assert_eq!(ue(br#"\rabc"#), "\rabc");
        assert_eq!(ue(br#"\tabc"#), "\tabc");

        // Escape at the end
        assert_eq!(ue(br#"abc\""#), "abc\"");
        assert_eq!(ue(br#"abc\\"#), "abc\\");
        assert_eq!(ue(br#"abc\/"#), "abc/");
        assert_eq!(ue(br#"abc\b"#), "abc\x08");
        assert_eq!(ue(br#"abc\f"#), "abc\x0C");
        assert_eq!(ue(br#"abc\n"#), "abc\n");
        assert_eq!(ue(br#"abc\r"#), "abc\r");
        assert_eq!(ue(br#"abc\t"#), "abc\t");

        // Escape in the middle
        assert_eq!(ue(br#"xy\"abc"#), "xy\"abc");
        assert_eq!(ue(br#"xy\\abc"#), "xy\\abc");
        assert_eq!(ue(br#"xy\/abc"#), "xy/abc");
        assert_eq!(ue(br#"xy\babc"#), "xy\x08abc");
        assert_eq!(ue(br#"xy\fabc"#), "xy\x0Cabc");
        assert_eq!(ue(br#"xy\nabc"#), "xy\nabc");
        assert_eq!(ue(br#"xy\rabc"#), "xy\rabc");
        assert_eq!(ue(br#"xy\tabc"#), "xy\tabc");

        // Multiple escapes
        assert_eq!(ue(br#" \" \" \" "#), " \" \" \" ");
        assert_eq!(ue(br#" \t \n \r "#), " \t \n \r ");
        assert_eq!(ue(br#" \"\"\" "#), " \"\"\" ");
        assert_eq!(ue(br#" \t\n\r "#), " \t\n\r ");

        // \uXXXX escapes
        assert_eq!(ue(br#" \u0001 "#), " \u{0001} ");
        assert_eq!(ue(br#" \u0010 "#), " \u{0010} ");
        assert_eq!(ue(br#" \u0100 "#), " \u{0100} ");
        assert_eq!(ue(br#" \u1000 "#), " \u{1000} ");
        assert_eq!(ue(br#" \uABCD "#), " \u{abcd} ");
        assert_eq!(ue(br#" \uabcd "#), " \u{abcd} ");
        assert_eq!(ue(br#" \uAbCd "#), " \u{abcd} ");
        assert_eq!(ue(br#" \uABCDefg "#), " \u{abcd}efg ");
        assert_eq!(ue(br#" \uabcdefg "#), " \u{abcd}efg ");
        assert_eq!(ue(br#" \uAbCdefg "#), " \u{abcd}efg ");
    }

    #[test]
    fn unescape_fails_for_control_characters() {
        // Every raw byte in 0x00..=0x1F must be rejected
        for byte in 0x00..=0x1Fu8 {
            assert_eq!(
                unescape(&[b' ', byte, b' ']),
                Err(Error::ControlCharacterInString),
                "control byte {:#04x} was not rejected",
                byte
            );
        }
    }

    #[test]
    fn unescape_fails_for_invalid_escape_sequence() {
        assert_eq!(unescape(br#" \ "#), Err(Error::InvalidEscape));
        assert_eq!(unescape(br#" \a "#), Err(Error::InvalidEscape));
        assert_eq!(unescape(br#" \N "#), Err(Error::InvalidEscape));
        assert_eq!(unescape(br#" \- "#), Err(Error::InvalidEscape));
        assert_eq!(unescape(br#" \' "#), Err(Error::InvalidEscape));
        assert_eq!(unescape(br#" \x "#), Err(Error::InvalidEscape));
        assert_eq!(unescape(br#" \x08 "#), Err(Error::InvalidEscape));

        // Malformed \u escapes
        assert_eq!(unescape(br#" \u{7A} "#), Err(Error::InvalidEscape));
        assert_eq!(unescape(br#" \uAAA "#), Err(Error::InvalidEscape));
        assert_eq!(unescape(br#" \uAA "#), Err(Error::InvalidEscape));
        assert_eq!(unescape(br#" \uA "#), Err(Error::InvalidEscape));
        assert_eq!(unescape(br#" \u "#), Err(Error::InvalidEscape));
        assert_eq!(unescape(br#" \u123. "#), Err(Error::InvalidEscape));
        assert_eq!(unescape(br#" \u123g "#), Err(Error::InvalidEscape));
        assert_eq!(unescape(br#" \u123\9 "#), Err(Error::InvalidEscape));

        // Input ends in the middle of an escape sequence
        assert_eq!(unescape(br#" \u123"#), Err(Error::InvalidEscape));
        assert_eq!(unescape(br#" \u12"#), Err(Error::InvalidEscape));
        assert_eq!(unescape(br#" \u1"#), Err(Error::InvalidEscape));
        assert_eq!(unescape(br#" \u"#), Err(Error::InvalidEscape));
        assert_eq!(unescape(br#" \"#), Err(Error::InvalidEscape));
    }

    #[test]
    fn unescape_works_for_surrogate_pairs() {
        // U+1F44F is encoded as the pair D83D DC4F, U+1F97A as D83E DD7A
        assert_eq!(ue(br#" \uD83D\uDC4F "#), " \u{1F44F} ");
        assert_eq!(ue(br#" \uD83D\uDC4F\uD83D\uDC4F "#), " \u{1F44F}\u{1F44F} ");
        assert_eq!(ue(br#" \uD83E\uDD7A "#), " \u{1F97A} ");
        assert_eq!(ue(br#" \uD83E\uDD7A\uD83D\uDC4F "#), " \u{1F97A}\u{1F44F} ");
    }

    #[test]
    fn unescape_fails_for_broken_surrogates() {
        assert_eq!(uee(br#" \uDEAD "#), Error::ExpectedHighSurrogate);
        assert_eq!(uee(br#" \uDC4F\uD83D "#), Error::ExpectedHighSurrogate);
        assert_eq!(uee(br#" \uD800\uD800 "#), Error::ExpectedLowSurrogate);
    }

    #[test]
    fn unescape_fails_for_lone_surrogates() {
        assert_eq!(uee(br#" \uD83Dabc "#), Error::LoneSurrogateFound);
        assert_eq!(uee(br#" \uD83D"#), Error::LoneSurrogateFound);
        assert_eq!(uee(br#" \uD800\u0001 "#), Error::LoneSurrogateFound);
        assert_eq!(uee(br#" \uD800\uD799 "#), Error::LoneSurrogateFound);
        assert_eq!(uee(br#" \uD800\uE000 "#), Error::LoneSurrogateFound);
    }

    #[test]
    fn hex_decode_works() {
        assert_eq!(hex_decode([b'0', b'0', b'0', b'0']), 0x0000);
        assert_eq!(hex_decode([b'0', b'0', b'0', b'1']), 0x0001);
        assert_eq!(hex_decode([b'0', b'0', b'1', b'0']), 0x0010);
        assert_eq!(hex_decode([b'0', b'1', b'0', b'0']), 0x0100);
        assert_eq!(hex_decode([b'1', b'0', b'0', b'0']), 0x1000);
        assert_eq!(hex_decode([b'1', b'1', b'1', b'1']), 0x1111);
        assert_eq!(hex_decode([b'1', b'1', b'1', b'0']), 0x1110);
        assert_eq!(hex_decode([b'1', b'1', b'0', b'1']), 0x1101);
        assert_eq!(hex_decode([b'1', b'0', b'1', b'1']), 0x1011);
        assert_eq!(hex_decode([b'0', b'1', b'1', b'1']), 0x0111);
        assert_eq!(hex_decode([b'2', b'3', b'4', b'5']), 0x2345);
        assert_eq!(hex_decode([b'6', b'7', b'8', b'9']), 0x6789);
        assert_eq!(hex_decode([b'a', b'b', b'c', b'd']), 0xabcd);
        assert_eq!(hex_decode([b'e', b'f', b'A', b'B']), 0xefab);
        assert_eq!(hex_decode([b'C', b'D', b'E', b'F']), 0xcdef);
    }
}