#![deny(rust_2018_idioms)]
#![deny(rust_2021_compatibility)]
#![deny(missing_docs)]
use std::iter::Peekable;
use std::io::Write;
/// Renders `bs` as upper-case, two-digit hexadecimal numbers separated by single spaces.
///
/// An empty slice produces an empty string.
pub fn pretty_bytes(bs: &[u8]) -> String {
    // Two hex digits plus one separator per byte is an upper bound on the output length.
    let mut rendered = String::with_capacity(bs.len() * 3);
    for (index, byte) in bs.iter().enumerate() {
        if index > 0 {
            rendered.push(' ');
        }
        rendered.push_str(&format!("{:02X}", byte));
    }
    rendered
}
/// Renders `bs` as text with every control character, and the space, made visible.
///
/// The bytes are decoded as UTF-8 lossily, so invalid sequences become U+FFFD. Code points
/// U+0000 through U+0020 are replaced by the matching symbol from the Unicode "Control Pictures"
/// block (U+2400 through U+2420), and DEL (U+007F) is replaced by U+2421 SYMBOL FOR DELETE.
/// Every other character is passed through unchanged.
pub fn pretty_string(bs: &[u8]) -> String {
    String::from_utf8_lossy(bs)
        .chars()
        .map(|c| match c {
            '\u{0}'..='\u{20}' => char::from_u32(c as u32 + 0x2400)
                .expect("Unicode code points 0x2400-2420 are valid."),
            // The picture for DEL does not sit at 0x2400 + 0x7F (that is U+247F, a
            // parenthesized number); it directly follows the symbol for space.
            '\u{7F}' => '\u{2421}',
            _ => c,
        })
        .collect()
}
/// Identifies which rule a backslash escape sequence broke.
///
/// A value of this type is carried in the `kind` field of `UnescapeError::InvalidBackslash`.
#[derive(Debug)]
pub enum InvalidBackslashKind {
/// A `\u{` escape reached the end of the input without finding a closing `}`.
RustStyleUnicodeMissingCloseBrace,
/// A `\u{}` escape had nothing between the braces.
RustStyleUnicodeMissingDigits,
/// The hexadecimal value of a Unicode escape is not accepted by `char::from_u32`.
UnicodeEscapeBadCodepoint,
/// The bytes collected as hexadecimal digits were not valid UTF-8.
HexDigitsNotUnicode,
/// The bytes collected as hexadecimal digits could not be parsed as a base-16 number of the
/// required width. The payload is the digit bytes that failed to parse.
HexDigitsNotHexDigits(Vec<u8>),
/// A `\x` escape was not followed by any hexadecimal digit.
HexDigitsNoDigits,
/// The bytes collected as octal digits were not valid UTF-8.
OctalDigitsNotUnicode,
/// The digits collected for an octal escape could not be parsed as a base-8 `u8`.
OctalDigitsNotOctalDigits,
/// The byte following `\u` or `\U` was not an ASCII hexadecimal digit (nor, for `\u`, a `{`).
UnicodeEscapeNoDigits,
/// The input ended immediately after `\u` or `\U`.
UnicodeEscapeEndOfString,
/// The byte following `\c` was outside both accepted ranges (`@` through `_` and `` ` ``
/// through `~`).
ControlEscapeBadKey,
/// The input ended immediately after `\c`.
ControlEscapeEndOfString,
/// The byte following the backslash does not start any recognised escape.
BackslashEscapeUnknown,
/// The input ended immediately after a backslash.
BackslashEndOfString,
}
use InvalidBackslashKind::*;
/// The error type returned by the unescaping functions in this file.
#[derive(Debug)]
pub enum UnescapeError
where
// None of these bounds mentions a type parameter, so each one is a statement about
// `UnescapeError` itself.
// NOTE(review): presumably they are here as compile-time assertions that the error type stays
// usable as a thread-safe, `'static` error — confirm intent.
UnescapeError: Send,
UnescapeError: Sync,
UnescapeError: 'static,
UnescapeError: std::fmt::Display,
UnescapeError: std::error::Error,
{
/// A backslash escape sequence was malformed.
InvalidBackslash {
/// Which rule the escape sequence broke.
kind: InvalidBackslashKind,
/// The offset handed to `UnescapeError::invalid_backslash`; `unescape_iter` passes the
/// offset its input iterator yielded for the backslash that starts the escape.
offset: usize,
/// The escape bytes rendered by `pretty_string`.
string: String,
/// The escape bytes rendered by `pretty_bytes`.
bytes: String,
},
/// The input ended before the requested closing delimiter byte was seen.
MissingClose {
/// The delimiter byte rendered by `pretty_string`.
string: String,
/// The delimiter byte rendered by `pretty_bytes`.
bytes: String,
},
/// An I/O error, converted through `From<std::io::Error>`; `unescape_iter` produces it when
/// writing to its output fails.
IOError(std::io::Error),
}
impl std::fmt::Display for UnescapeError {
    /// Writes a one-line, human-readable description of the error to `f`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::IOError(e) => f.write_fmt(format_args!("While unescaping: {e}")),
            Self::MissingClose { string, bytes } => f.write_fmt(format_args!(
                "Reached end of string while looking for closing delimiter byte {} ({})",
                string, bytes
            )),
            Self::InvalidBackslash { kind, offset, string, bytes } => f.write_fmt(format_args!(
                "Invalid backslash ({:?}) at byte {}: {} ({})",
                kind, offset, string, bytes
            )),
        }
    }
}
impl UnescapeError {
    /// Builds a [`UnescapeError::MissingClose`] for the closing delimiter `byte`.
    ///
    /// The delimiter is stored twice: once rendered by `pretty_string` and once by
    /// `pretty_bytes`.
    pub fn missing_close(byte: u8) -> Self {
        let delimiter = [byte];
        Self::MissingClose {
            string: pretty_string(&delimiter),
            bytes: pretty_bytes(&delimiter),
        }
    }

    /// Builds a [`UnescapeError::InvalidBackslash`] of the given `kind`.
    ///
    /// `offset` is stored as given, and `bytes` (the escape sequence as read so far) is stored
    /// both as `pretty_string` and as `pretty_bytes` renderings.
    pub fn invalid_backslash(
        offset: usize,
        bytes: &[u8],
        kind: InvalidBackslashKind,
    ) -> Self {
        let string = pretty_string(bytes);
        let bytes = pretty_bytes(bytes);
        Self::InvalidBackslash { kind, offset, string, bytes }
    }
}
impl From<std::io::Error> for UnescapeError {
fn from(error: std::io::Error) -> Self {
UnescapeError::IOError(error)
}
}
// Marks `UnescapeError` as a standard error type. The body is empty, so every
// `std::error::Error` method keeps its default implementation.
impl std::error::Error for UnescapeError {
}
/// Decodes a run of hexadecimal digits inside `escape` as a Unicode code point and returns the
/// UTF-8 encoding of that character.
///
/// The digits are `escape[start..=end]` when `end` is `Some`, or `escape[start..]` when it is
/// `None`. `offset` and the whole of `escape` are only used to build error values.
///
/// # Errors
///
/// Returns `UnescapeError::InvalidBackslash` with kind
/// * `HexDigitsNotUnicode` if the digit bytes are not valid UTF-8,
/// * `HexDigitsNotHexDigits` if they are empty, contain anything other than ASCII hexadecimal
///   digits, or do not fit in a `u32`,
/// * `UnicodeEscapeBadCodepoint` if `char::from_u32` rejects the value.
///
/// # Panics
///
/// Panics if the requested range lies outside `escape`.
fn unhex(
    offset: usize,
    escape: &[u8],
    start: usize,
    end: Option<usize>,
) -> Result<Vec<u8>, UnescapeError> {
    let digits = match end {
        Some(i) => &escape[start..=i],
        None => &escape[start..],
    };
    let hex = std::str::from_utf8(digits)
        .map_err(|_| UnescapeError::invalid_backslash(offset, escape, HexDigitsNotUnicode))?;
    // `from_str_radix` tolerates a leading `+`, which is not a hexadecimal digit, so the digits
    // are checked explicitly before parsing.
    if !hex.bytes().all(|b| b.is_ascii_hexdigit()) {
        return Err(UnescapeError::invalid_backslash(
            offset,
            escape,
            HexDigitsNotHexDigits(digits.to_vec()),
        ));
    }
    let ord = u32::from_str_radix(hex, 16).map_err(|_| {
        UnescapeError::invalid_backslash(offset, escape, HexDigitsNotHexDigits(digits.to_vec()))
    })?;
    let out_char = char::from_u32(ord).ok_or_else(|| {
        UnescapeError::invalid_backslash(offset, escape, UnicodeEscapeBadCodepoint)
    })?;
    // A `char` encodes to at most four UTF-8 bytes.
    let mut utf8 = [0u8; 4];
    Ok(out_char.encode_utf8(&mut utf8).as_bytes().to_vec())
}
/// Finishes decoding a `\u{...}` escape whose first three bytes (`\`, `u`, `{`) are already in
/// `escape`.
///
/// Bytes are pulled from `bytes` and appended to `escape` up to and including the first `}`.
/// Whatever sits between the braces is then handed to `unhex`, and the UTF-8 bytes it returns
/// are passed back to the caller. `offset` is only used when building error values.
///
/// # Errors
///
/// Returns `UnescapeError::InvalidBackslash` with kind `RustStyleUnicodeMissingCloseBrace` if the
/// input ends before a `}` is seen, `RustStyleUnicodeMissingDigits` if the braces are empty, or
/// whatever error `unhex` reports for the digits.
fn un_rust_style_u<'a, I>(
    bytes: &mut Peekable<I>,
    offset: usize,
    escape: &mut Vec<u8>,
) -> Result<Vec<u8>, UnescapeError>
where
    I: Iterator<Item = (usize, &'a u8)>,
    I: ExactSizeIterator<Item = (usize, &'a u8)>,
{
    // Index in `escape` of the first byte after `\u{`.
    const FIRST_DIGIT: usize = 3;

    let mut closed = false;
    for (_, &current) in bytes.by_ref() {
        escape.push(current);
        if current == b'}' {
            closed = true;
            break;
        }
    }
    if !closed {
        return Err(UnescapeError::invalid_backslash(
            offset,
            escape.as_slice(),
            RustStyleUnicodeMissingCloseBrace,
        ));
    }

    // The last byte is the `}`, so the final digit sits one position before it.
    let last_digit = escape.len() - 2;
    if last_digit >= FIRST_DIGIT {
        unhex(offset, escape.as_slice(), FIRST_DIGIT, Some(last_digit))
    } else if last_digit + 1 == FIRST_DIGIT {
        Err(UnescapeError::invalid_backslash(
            offset,
            escape.as_slice(),
            RustStyleUnicodeMissingDigits,
        ))
    } else {
        unreachable!();
    }
}
pub fn unescape_iter<'a, I, O>(
bytes: &mut Peekable<I>,
out: &mut O,
close: Option<u8>
) -> Result<usize, UnescapeError>
where
I: Iterator<Item = (usize, &'a u8)>,
I: ExactSizeIterator<Item = (usize, &'a u8)>,
O: Write,
{
let close_delimiter: u8;
let have_close: bool;
match close {
Some(b) => {
close_delimiter = b;
have_close = true;
}
None => {
close_delimiter = 0;
have_close = false;
}
}
let mut last_offset: Option<usize> = None;
while let Some((offset, &byte)) = bytes.next() {
if byte == b'\\' {
let mut escape: Vec<u8> = Vec::with_capacity(12);
escape.push(byte);
if let Some((_, &byte2)) = bytes.next() {
escape.push(byte2);
let _wrote = match byte2 {
b'a' => out.write(&[0x07])?, b'b' => out.write(&[0x08])?, b'e' | b'E' => out.write(&[0x1B])?, b'f' => out.write(&[0x0C])?, b'n' => out.write(&[0x0A])?, b'r' => out.write(&[0x0D])?, b't' => out.write(&[0x09])?, b'v' => out.write(&[0x0B])?, b'\'' => out.write(&[b'\''])?, b'"' => out.write(&[b'"'])?, b'\\' => out.write(&[b'\\'])?, b'0'..=b'9' => {
for _ in 3..=4 {
if let Some((_, &byte3)) = bytes.peek() {
if byte3.is_ascii_digit() {
escape.push(byte3);
}
let (_, _) = bytes.next().expect("Just peeked, so this should never return None.");
}
}
let octal: String = match String::from_utf8(escape[1..].to_vec()) {
Ok(s) => s,
Err(_) => { return Err(UnescapeError::invalid_backslash(offset, &escape, OctalDigitsNotUnicode)); }
};
let out_byte: u8 = match u8::from_str_radix(&octal, 8) {
Ok(b) => b,
Err(_) => { return Err(UnescapeError::invalid_backslash(offset, &escape, OctalDigitsNotOctalDigits)); }
};
out.write(&[out_byte])?
}
b'x' => { for _ in 3..=4 {
if let Some((_, &byte3)) = bytes.peek() {
if byte3.is_ascii_hexdigit() {
escape.push(byte3);
}
let (_, _) = bytes.next().expect("Just peeked, so this should never return None.");
}
}
if escape.len() == 2 { return Err(UnescapeError::invalid_backslash(offset, &escape, HexDigitsNoDigits));
}
let hex: String = match String::from_utf8(escape[2..].to_vec()) {
Ok(s) => s,
Err(_) => { return Err(UnescapeError::invalid_backslash(offset, &escape, HexDigitsNotUnicode)); }
};
let out_byte: u8 = match u8::from_str_radix(&hex, 16) {
Ok(b) => b,
Err(_) => { return Err(UnescapeError::invalid_backslash(offset, &escape, HexDigitsNotHexDigits(hex.as_bytes().to_vec()))); }
};
out.write(&[out_byte])?
}
b'u' => {
if let Some((_, &byte3)) = bytes.next() {
escape.push(byte3);
if byte3 == b'{' {
let u_bytes: Vec<u8> = un_rust_style_u(bytes, offset, &mut escape)?;
out.write(&u_bytes.as_slice())?
} else {
if ! byte3.is_ascii_hexdigit() {
return Err(UnescapeError::invalid_backslash(offset, &escape, UnicodeEscapeNoDigits));
}
for _ in 4..=6 {
if let Some((_, &byte4)) = bytes.peek() {
if byte3.is_ascii_hexdigit() {
escape.push(byte4);
}
let (_, _) = bytes.next().expect("Just peeked, so this should never return None.");
}
}
let utf8 = unhex(offset, &escape, 2, None)?;
out.write(&utf8.as_slice())?
}
} else {
return Err(UnescapeError::invalid_backslash(offset, &escape, UnicodeEscapeEndOfString));
}
}
b'U' => {
if let Some((_, &byte3)) = bytes.next() {
escape.push(byte3);
if ! byte3.is_ascii_hexdigit() {
return Err(UnescapeError::invalid_backslash(offset, &escape, UnicodeEscapeNoDigits));
}
for _ in 4..=10 {
if let Some((_, &byte4)) = bytes.peek() {
if byte3.is_ascii_hexdigit() {
escape.push(byte4);
}
let (_, _) = bytes.next().expect("Just peeked, so this should never return None.");
}
}
let utf8 = unhex(offset, &escape, 2, None)?;
out.write(&utf8.as_slice())?
} else {
return Err(UnescapeError::invalid_backslash(offset, &escape, UnicodeEscapeEndOfString));
}
}
b'c' => {
if let Some((_, &byte3)) = bytes.next() {
escape.push(byte3);
if (b'@'..=b'_').contains(&byte3) {
out.write(&[byte3-0x40].as_slice())?
} else if (b'`'..=b'~').contains(&byte3) {
out.write(&[byte3-0x60].as_slice())?
} else {
return Err(UnescapeError::invalid_backslash(offset, &escape, ControlEscapeBadKey));
}
} else {
return Err(UnescapeError::invalid_backslash(offset, &escape, ControlEscapeEndOfString));
}
}
_ => { return Err(UnescapeError::invalid_backslash(offset, &escape, BackslashEscapeUnknown)); }
};
} else {
UnescapeError::invalid_backslash(offset, &escape, BackslashEndOfString);
}
} else if have_close && byte == close_delimiter {
return Ok(offset);
} else {
out.write(&[byte])?;
}
last_offset = Some(offset);
}
if have_close {
Err(UnescapeError::missing_close(close_delimiter))
} else {
return Ok(last_offset.expect("If last_offset isn't set by now, it's a bug."));
}
}
/// Resolves the backslash escapes in `bytes` and returns the resulting bytes.
///
/// The whole slice is processed through `unescape_iter` with no closing delimiter.
///
/// # Errors
///
/// Returns whatever error `unescape_iter` reports.
pub fn unescape_bytes(bytes: &[u8]) -> Result<Vec<u8>, UnescapeError> {
    let mut unescaped: Vec<u8> = Vec::with_capacity(bytes.len());
    let mut input = bytes.iter().enumerate().peekable();
    unescape_iter(&mut input, &mut unescaped, None)?;
    Ok(unescaped)
}
#[cfg(test)]
mod tests;