use std::char;
use std::cmp::min;
use std::iter::Peekable;
include!(concat!(env!("OUT_DIR"), "/entities.rs"));
/// Expands the HTML character references in `escaped` and returns the result.
///
/// Every byte other than `&` is copied through unchanged. On each `&`, the
/// rest of the input is handed to `match_entity`, which consumes whatever it
/// recognises and returns the bytes to emit in its place.
///
/// `escaped` may be any byte source. If the resulting bytes are not valid
/// UTF-8 (which requires the input itself to contain invalid bytes), the
/// invalid sequences are replaced with U+FFFD REPLACEMENT CHARACTER instead of
/// panicking.
pub fn unescape<S: AsRef<[u8]>>(escaped: S) -> String {
    let escaped = escaped.as_ref();
    let mut iter = escaped.iter().peekable();
    // Most input contains few entities, so the input length is a good first
    // guess for the output length.
    let mut buffer = Vec::with_capacity(escaped.len());

    // `while let` rather than `for`: `match_entity` needs mutable access to
    // the iterator inside the loop body.
    while let Some(&c) = iter.next() {
        if c == b'&' {
            buffer.extend_from_slice(&match_entity(&mut iter));
        } else {
            buffer.push(c);
        }
    }

    String::from_utf8(buffer)
        .unwrap_or_else(|error| String::from_utf8_lossy(error.as_bytes()).into_owned())
}
/// Panic message for the "cannot happen" case where `iter.next()` yields
/// something different from what the immediately preceding `iter.peek()`
/// reported.
const PEEK_MATCH_ERROR: &str = "iter.next() did not match previous iter.peek()";
#[allow(clippy::from_str_radix_10)]
fn match_numeric_entity<'a, I>(iter: &mut Peekable<I>) -> Vec<u8>
where
I: Iterator<Item = &'a u8>,
{
let c = iter.next().expect(PEEK_MATCH_ERROR);
if *c != b'#' {
panic!("{}", PEEK_MATCH_ERROR);
}
let mut best_expansion = vec![b'&', b'#'];
let number = match iter.peek() {
Some(&b'x') | Some(&b'X') => {
best_expansion.push(*iter.next().expect(PEEK_MATCH_ERROR));
let hex = consume_hexadecimal(iter);
best_expansion.extend_from_slice(&hex);
u32::from_str_radix(&String::from_utf8(hex).unwrap(), 16)
}
Some(_) => {
let dec = consume_decimal(iter);
best_expansion.extend_from_slice(&dec);
u32::from_str_radix(&String::from_utf8(dec).unwrap(), 10)
}
None => {
return best_expansion;
}
};
if let Some(&b';') = iter.peek() {
best_expansion.push(*iter.next().expect(PEEK_MATCH_ERROR));
} else {
}
if let Ok(number) = number {
if let Some(expansion) = correct_numeric_entity(number) {
return expansion;
}
}
best_expansion
}
/// U+FFFD, the character `correct_numeric_entity` substitutes for numeric
/// references to NUL, to surrogates, and to values beyond U+10FFFF.
pub const REPLACEMENT_CHAR: char = '\u{fffd}';
/// Returns `true` if `c` is a Unicode noncharacter: one of U+FDD0..=U+FDEF, or
/// the last two code points (`xFFFE`, `xFFFF`) of any plane from 0 to 0x10.
fn is_noncharacter<C: Into<u32>>(c: C) -> bool {
    let code_point: u32 = c.into();

    if (0xFDD0..=0xFDEF).contains(&code_point) {
        return true;
    }

    // Within the Unicode range, a code point ends its plane exactly when its
    // low 16 bits are 0xFFFE or 0xFFFF, i.e. when all bits but the lowest of
    // the low half are set.
    code_point <= 0x10FFFF && (code_point & 0xFFFE) == 0xFFFE
}
/// Returns `true` if `c` lies beyond U+10FFFF, the highest Unicode code point.
fn is_outside_range<C: Into<u32>>(c: C) -> bool {
    let code_point: u32 = c.into();
    code_point >= 0x11_0000
}
/// Returns `true` if `c` is in the UTF-16 surrogate range U+D800..=U+DFFF.
fn is_surrogate<C: Into<u32>>(c: C) -> bool {
    let code_point: u32 = c.into();
    matches!(code_point, 0xD800..=0xDFFF)
}
/// Returns `true` if `c` is a C0 control (U+0000..=U+001F) or lies in
/// U+007F..=U+009F (DELETE plus the C1 controls).
fn is_control<C: Into<u32>>(c: C) -> bool {
    let code_point: u32 = c.into();
    matches!(code_point, 0x00..=0x1F | 0x7F..=0x9F)
}
/// Returns `true` if `c` is TAB, LF, FF, CR or SPACE.
fn is_ascii_whitespace<C: Into<u32>>(c: C) -> bool {
    const WHITESPACE: [u32; 5] = [0x09, 0x0A, 0x0C, 0x0D, 0x20];

    let code_point: u32 = c.into();
    WHITESPACE.contains(&code_point)
}
/// Maps the number from a numeric character reference to the UTF-8 bytes it
/// should expand to, or `None` if the reference should not be expanded.
///
/// Rules are applied in order:
/// 1. NUL, values beyond U+10FFFF and surrogates become `REPLACEMENT_CHAR`.
/// 2. Noncharacters are not expanded.
/// 3. Most of 0x80..=0x9F is remapped to the character that byte denotes in
///    Windows-1252 (0x80 becomes U+20AC, and so on).
/// 4. Carriage return is not expanded; other ASCII whitespace maps to itself.
/// 5. Remaining control characters (including the unmapped 0x81, 0x8D, 0x8F,
///    0x90 and 0x9D) are not expanded.
/// 6. Anything else maps to itself.
fn correct_numeric_entity(number: u32) -> Option<Vec<u8>> {
    // First decide which character (if any) the reference stands for, then
    // encode it once at the end.
    let corrected: Option<char> = match number {
        0x00 => Some(REPLACEMENT_CHAR),
        n if is_outside_range(n) || is_surrogate(n) => Some(REPLACEMENT_CHAR),
        n if is_noncharacter(n) => None,

        0x80 => Some('\u{20AC}'),
        0x82 => Some('\u{201A}'),
        0x83 => Some('\u{0192}'),
        0x84 => Some('\u{201E}'),
        0x85 => Some('\u{2026}'),
        0x86 => Some('\u{2020}'),
        0x87 => Some('\u{2021}'),
        0x88 => Some('\u{02C6}'),
        0x89 => Some('\u{2030}'),
        0x8A => Some('\u{0160}'),
        0x8B => Some('\u{2039}'),
        0x8C => Some('\u{0152}'),
        0x8E => Some('\u{017D}'),
        0x91 => Some('\u{2018}'),
        0x92 => Some('\u{2019}'),
        0x93 => Some('\u{201C}'),
        0x94 => Some('\u{201D}'),
        0x95 => Some('\u{2022}'),
        0x96 => Some('\u{2013}'),
        0x97 => Some('\u{2014}'),
        0x98 => Some('\u{02DC}'),
        0x99 => Some('\u{2122}'),
        0x9A => Some('\u{0161}'),
        0x9B => Some('\u{203A}'),
        0x9C => Some('\u{0153}'),
        0x9E => Some('\u{017E}'),
        0x9F => Some('\u{0178}'),

        // Must precede the whitespace arm, which would otherwise accept CR.
        0x0D => None,
        n if is_ascii_whitespace(n) => char::from_u32(n),
        n if is_control(n) => None,
        n => char::from_u32(n),
    };

    corrected.map(|c| c.to_string().into_bytes())
}
/// Generates a function `$name(iter) -> Vec<u8>` that takes bytes from the
/// front of a peekable byte iterator for as long as they match one of the
/// `$accept` patterns, and returns the bytes taken.
///
/// The first non-matching byte is left in the iterator. The result is empty
/// if the very first byte does not match or the iterator is exhausted.
macro_rules! consumer {
    ($name:ident, $($accept:pat)|+) => {
        fn $name<'a, I>(iter: &mut Peekable<I>) -> Vec<u8>
        where
            I: Iterator<Item = &'a u8>,
        {
            let mut accepted: Vec<u8> = Vec::new();
            while let Some(&&byte) = iter.peek() {
                let wanted = match byte {
                    $($accept)|+ => true,
                    _ => false,
                };
                if !wanted {
                    break;
                }
                accepted.push(byte);
                // Advance past the byte that was just peeked and stored.
                iter.next();
            }
            accepted
        }
    };
}
// Takes a leading run of ASCII decimal digits.
consumer!(consume_decimal, b'0'..=b'9');
// Takes a leading run of ASCII hexadecimal digits, in either case.
consumer!(consume_hexadecimal, b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F');
// Takes a leading run of ASCII letters and digits.
consumer!(consume_alphanumeric, b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z');
/// Expands the character reference that follows an already-consumed `&`.
///
/// A `#` at the front delegates to `match_numeric_entity`. Otherwise the
/// leading run of ASCII alphanumerics plus an optional `;` is consumed, and the
/// longest prefix of `&` + that text found in `ENTITIES` is replaced by its
/// expansion; whatever follows the matched prefix is kept verbatim. If no
/// prefix matches, the consumed text is returned unchanged.
fn match_entity<'a, I>(iter: &mut Peekable<I>) -> Vec<u8>
where
    I: Iterator<Item = &'a u8>,
{
    if let Some(&b'#') = iter.peek() {
        return match_numeric_entity(iter);
    }

    // Rebuild the text as it appeared in the input, ampersand included,
    // because that is the form the table is looked up with.
    let mut candidate = vec![b'&'];
    candidate.extend(consume_alphanumeric(iter));
    if let Some(&b';') = iter.peek() {
        candidate.push(*iter.next().expect(PEEK_MATCH_ERROR));
    }

    if candidate.len() < ENTITY_MIN_LENGTH {
        return candidate;
    }

    // Try the longest possible name first so that e.g. a longer entity wins
    // over a shorter one that is its prefix.
    let longest = min(candidate.len(), ENTITY_MAX_LENGTH);
    for prefix_len in (ENTITY_MIN_LENGTH..=longest).rev() {
        let (prefix, tail) = candidate.split_at(prefix_len);
        if let Some(expansion) = ENTITIES.get(prefix) {
            let mut result = Vec::with_capacity(expansion.len() + tail.len());
            result.extend_from_slice(expansion);
            result.extend_from_slice(tail);
            return result;
        }
    }

    candidate
}
// Unit tests for `unescape`. Each `test!` invocation names a case and gives
// the boolean expression it checks; the `test!` macro itself is defined
// outside this file.
//
// NOTE(review): many cases below compare a literal with an identical literal
// (for example `exact` and `correct_dec`), while their names suggest the
// input was meant to be an escaped form such as a named or numeric
// reference. Confirm the inputs against the project history before relying
// on these cases.
#[cfg(test)]
mod tests {
use super::*;
// Named references, with and without the terminating semicolon, and names
// that are prefixes of longer names.
test!(almost_entity, unescape("&time") == "&time");
test!(exact_no_semicolon, unescape("×") == "×");
test!(exact, unescape("×") == "×");
test!(entity_char, unescape("×a") == "×a");
test!(entity_char_is_prefix, unescape("×b") == "×b");
test!(exact_timesb, unescape("⊠") == "⊠");
// Input without any reference, and bare ampersands.
test!(no_entities, unescape("none") == "none");
test!(only_ampersand, unescape("&") == "&");
test!(empty_entity, unescape("&;") == "&;");
test!(middle_entity, unescape(" & ") == " & ");
test!(extra_ampersands, unescape("&&&") == "&&&");
test!(two_entities, unescape("AND && and") == "AND && and");
// A name longer than any known entity must come back unchanged.
test!(
long_entity,
unescape("&aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa;")
== "&aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa;"
);
// Numeric references in hexadecimal and decimal form.
test!(correct_hex_lowerx_lower, unescape("z") == "z");
test!(correct_hex_lowerx_upper, unescape("z") == "z");
test!(correct_hex_upperx_lower, unescape("z") == "z");
test!(correct_hex_upperx_upper, unescape("z") == "z");
test!(correct_dec, unescape("z") == "z");
test!(correct_hex_unicode, unescape("⇒") == "⇒");
test!(hex_no_semicolon, unescape("zz") == "zz");
test!(hex_no_semicolon_end, unescape("z") == "z");
test!(dec_no_semicolon, unescape("zz") == "zz");
test!(dec_no_semicolon_end, unescape("z") == "z");
test!(hex_instead_of_dec, unescape("a;") == "a;");
// `Z` is not a hexadecimal digit, so these are left untouched.
test!(invalid_hex_lowerx, unescape("&#xZ;") == "&#xZ;");
test!(invalid_hex_upperx, unescape("&#XZ;") == "&#XZ;");
// Numbers that `correct_numeric_entity` treats specially.
test!(special_entity_null, unescape("�") == "\u{fffd}");
test!(special_entity_bullet, unescape("•") == "•");
test!(
special_entity_bullets,
unescape("••••") == "••••"
);
test!(special_entity_space, unescape(" ") == " ");
// Corpus files pairing a source text with its expected expansion; per their
// names they are meant to cover every known entity.
const ALL_SOURCE: &str = include_str!("../tests/corpus/all-entities-source.txt");
const ALL_EXPANDED: &str = include_str!("../tests/corpus/all-entities-expanded.txt");
test!(all_entities, unescape(ALL_SOURCE) == ALL_EXPANDED);
}