use std::fmt;
use std::fmt::Write;
/// Error returned by [`unescape_json_string`] when a `\uXXXX` escape encodes a
/// surrogate that is not part of a valid high/low surrogate pair.
///
/// The type implements [`std::error::Error`], so it can be propagated with `?`
/// into boxed/dynamic error types and used with `Result::unwrap`/`expect`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UnescapeError {
    /// Character position printed in the error message.
    index: usize,
    /// The four characters that followed `\u` in the offending escape, one byte each.
    codepoint_chars: [u8; 4],
    /// Which surrogate rule was violated.
    error: UnicodeError,
}

/// The ways a `\uXXXX` escape can be an invalid surrogate.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum UnicodeError {
    /// A low surrogate appeared without a high surrogate in front of it.
    UnexpectedLowSurrogate,
    /// A high surrogate was not followed by a `\u` escape holding a low surrogate.
    UnmatchedHighSurrogate,
}

impl fmt::Display for UnescapeError {
    /// Writes `unescaping error at char N: ` followed by a description that
    /// quotes the offending `\uXXXX` escape.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "unescaping error at char {}: ", self.index)?;
        // Lossy decoding is identical to strict decoding for valid UTF-8, but it
        // cannot panic, which a `Display` implementation should never do.
        let codepoint_chars = String::from_utf8_lossy(&self.codepoint_chars);
        match self.error {
            UnicodeError::UnexpectedLowSurrogate => {
                write!(f, "unexpected low surrogate \"\\u{codepoint_chars}\"")
            }
            UnicodeError::UnmatchedHighSurrogate => write!(
                f,
                "high surrogate \"\\u{codepoint_chars}\" not followed by low surrogate"
            ),
        }
    }
}

impl std::error::Error for UnescapeError {}
/// Classification of one 16-bit `\uXXXX` value, as produced by `decode_codepoint`.
enum DecodedCodepoint {
/// A value outside the surrogate range, already converted to a `char`.
Char(char),
/// A low surrogate (`0xDC00..=0xDFFF`); the payload is the value minus `0xDC00`.
LowSurrogate(u16),
/// A high surrogate (`0xD800..=0xDBFF`); the payload is the value minus `0xD800`.
HighSurrogate(u16),
}
pub fn unescape_json_string(s: &str) -> Result<String, UnescapeError> {
let mut chars = s.chars();
let mut unescaped = String::with_capacity(s.len());
let mut index = 1;
while let Some(ch) = chars.next() {
index += 1;
if ch != '\\' {
if is_control(ch) {
unescaped.push_str("\\u00");
write!(unescaped, "{:02X}", ch as u32).unwrap();
} else {
unescaped.push(ch);
}
continue;
}
let escaped = chars.next().unwrap();
index += 1;
match escaped {
'"' => unescaped.push('"'),
'\\' => unescaped.push('\\'),
'/' => unescaped.push('/'),
'b' => unescaped.push_str("\\b"),
'f' => unescaped.push('\x0c'),
'n' => unescaped.push('\n'),
'r' => unescaped.push('\r'),
't' => unescaped.push('\t'),
'u' => {
let (codepoint, codepoint_chars) = parse_codepoint_from_chars(&mut chars);
index += 4;
match decode_codepoint(codepoint) {
DecodedCodepoint::Char(ch) => {
if is_control(ch) {
unescaped.push_str("\\u");
unescaped.push(codepoint_chars[0] as char);
unescaped.push(codepoint_chars[1] as char);
unescaped.push(codepoint_chars[2] as char);
unescaped.push(codepoint_chars[3] as char);
} else {
unescaped.push(ch)
}
}
DecodedCodepoint::LowSurrogate(_) => {
return Err(UnescapeError {
index: index - 6,
codepoint_chars,
error: UnicodeError::UnexpectedLowSurrogate,
});
}
DecodedCodepoint::HighSurrogate(hs) => match (chars.next(), chars.next()) {
(Some('\\'), Some('u')) => {
index += 2;
let (codepoint, _) = parse_codepoint_from_chars(&mut chars);
index += 4;
match decode_codepoint(codepoint) {
DecodedCodepoint::LowSurrogate(ls) => {
let codepoint = (hs as u32) * 0x400 + (ls as u32) + 0x10000;
unescaped.push(char::from_u32(codepoint).unwrap());
}
_ => {
return Err(UnescapeError {
index,
codepoint_chars,
error: UnicodeError::UnmatchedHighSurrogate,
});
}
}
}
_ => {
return Err(UnescapeError {
index,
codepoint_chars,
error: UnicodeError::UnmatchedHighSurrogate,
});
}
},
}
}
_ => panic!("Unexpected escape character in JSON string: {}", ch),
}
}
Ok(unescaped)
}
/// Reports whether `ch` lies in U+0000..=U+001F or U+007F..=U+009F.
fn is_control(ch: char) -> bool {
    let code = u32::from(ch);
    code <= 0x1F || (0x7F..=0x9F).contains(&code)
}
fn parse_codepoint_from_chars(chars: &mut std::str::Chars<'_>) -> (u16, [u8; 4]) {
let mut codepoint = 0;
let chars = [
chars.next().unwrap(),
chars.next().unwrap(),
chars.next().unwrap(),
chars.next().unwrap(),
];
let utf8_chars = [
chars[0] as u8,
chars[1] as u8,
chars[2] as u8,
chars[3] as u8,
];
codepoint += hex_char_to_int(chars[0]) * 0x1000;
codepoint += hex_char_to_int(chars[1]) * 0x0100;
codepoint += hex_char_to_int(chars[2]) * 0x0010;
codepoint += hex_char_to_int(chars[3]);
(codepoint, utf8_chars)
}
/// Converts one hexadecimal digit (`0-9`, `a-f`, `A-F`) to its numeric value.
///
/// # Panics
///
/// Panics if `ch` is not a hexadecimal digit.
fn hex_char_to_int(ch: char) -> u16 {
    match ch.to_digit(16) {
        // A base-16 digit is at most 15, so the narrowing cast cannot truncate.
        Some(digit) => digit as u16,
        None => panic!("Unexpected non-hex digit: {}", ch),
    }
}
/// Classifies a 16-bit value as a high surrogate, a low surrogate, or an
/// ordinary character.
///
/// Surrogates are returned as their offset from the start of their range.
fn decode_codepoint(codepoint: u16) -> DecodedCodepoint {
    const HIGH_START: u16 = 0xD800;
    const LOW_START: u16 = 0xDC00;
    const SURROGATE_END: u16 = 0xDFFF;

    if (HIGH_START..LOW_START).contains(&codepoint) {
        return DecodedCodepoint::HighSurrogate(codepoint - HIGH_START);
    }
    if (LOW_START..=SURROGATE_END).contains(&codepoint) {
        return DecodedCodepoint::LowSurrogate(codepoint - LOW_START);
    }
    // Every 16-bit value outside the surrogate range is a valid scalar value.
    let scalar = char::from_u32(u32::from(codepoint)).unwrap();
    DecodedCodepoint::Char(scalar)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Unescapes `escaped` and compares the outcome with `expected_unescaped`.
    ///
    /// An error is rendered as `ERR: <display text>`, so both successes and
    /// failures can be checked with a plain string comparison.
    #[track_caller]
    fn check(escaped: &str, expected_unescaped: &str) {
        let unescaped = match unescape_json_string(escaped) {
            Ok(s) => s,
            Err(err) => format!("ERR: {err}"),
        };
        assert_eq!(expected_unescaped, &unescaped);
    }

    #[test]
    fn test_unescape_json_string() {
        // Plain text and simple escapes.
        check("abc", "abc");
        check("abc \\\\ \\\"", "abc \\ \"");
        check("abc \\n \\t \\r", "abc \n \t \r");
        check("abc \\/ \\f", "abc / \u{c}");

        // `\u` escapes, including a surrogate pair in either hex case.
        check("€ \\u20AC", "€ \u{20AC}");
        check("𐐷 \\uD801\\uDC37", "𐐷 \u{10437}");
        check("𐐷 \\ud801\\udc37", "𐐷 \u{10437}");

        // Control characters stay escaped in the output.
        check("12x\\b34", "12x\\b34");
        check(
            "\\u0000 | \\u001f | \\u0020 | \\u007e | \\u007f | \\u0080 | \\u009F | \\u00a0",
            "\\u0000 | \\u001f | \u{0020} | \u{007e} | \\u007f | \\u0080 | \\u009F | \u{00a0}",
        );
        check("12 \u{0080} 34", "12 \\u0080 34");

        // Invalid surrogates are reported as errors.
        check(
            "abc 𐐷 \\uD801\\uDC37 \\uD801",
            "ERR: unescaping error at char 26: high surrogate \"\\uD801\" not followed by low surrogate",
        );
        check(
            "abc 𐐷 \\uD801\\uDC37 \\uDC37",
            "ERR: unescaping error at char 20: unexpected low surrogate \"\\uDC37\"",
        );
    }
}