use std::borrow::Cow;
use std::char;
use std::num::IntErrorKind;
use std::result::Result;
use std::slice;
/// Expands the entities in `escaped`, with the matcher type `M` deciding what
/// counts as an entity.
///
/// `_matcher` is never read; passing a value simply lets the caller select
/// `M`. When `unescape_in_internal` reports that nothing was expanded the
/// converted input is handed back as is; otherwise the result is a newly built
/// `String`.
///
/// # Panics
///
/// Panics if the expanded bytes are not valid UTF-8.
pub fn unescape_in<'a, M: Matcher, S: Into<Cow<'a, str>>>(
    _matcher: M,
    escaped: S,
) -> Cow<'a, str> {
    let input = escaped.into();
    let expanded = unescape_in_internal::<M>(input.as_bytes());
    match expanded {
        None => input,
        Some(bytes) => Cow::Owned(String::from_utf8(bytes).unwrap()),
    }
}
/// Byte-slice counterpart of `unescape_in`: expands the entities in `escaped`
/// without requiring the input to be valid UTF-8.
///
/// `_matcher` is never read; it only selects the matcher type `M`. The
/// converted input is returned unchanged when `unescape_in_internal` expands
/// nothing, otherwise the newly built buffer is returned.
pub fn unescape_bytes_in<'a, M: Matcher, S: Into<Cow<'a, [u8]>>>(
    _matcher: M,
    escaped: S,
) -> Cow<'a, [u8]> {
    let input = escaped.into();
    let expanded = unescape_in_internal::<M>(&input);
    expanded.map_or(input, Cow::Owned)
}
/// Core of the unescape functions: replaces every entity that `M` recognizes
/// in `escaped` with its expansion.
///
/// Returns `None` when no `&` in the input starts a recognized entity, so the
/// caller can hand back its input untouched. Otherwise returns the rewritten
/// bytes.
fn unescape_in_internal<M: Matcher>(escaped: &[u8]) -> Option<Vec<u8>> {
    // Created on the first successful match only, sized to the input length.
    let mut output: Option<Vec<u8>> = None;
    // Everything before this offset has already been copied or replaced.
    let mut copied_to = 0;

    for start in memchr::memchr_iter(b'&', escaped) {
        let mut cursor = escaped[start..].iter();
        let Some(expansion) = M::match_entity(&mut cursor) else {
            continue;
        };

        let buffer =
            output.get_or_insert_with(|| Vec::with_capacity(escaped.len()));
        buffer.extend_from_slice(&escaped[copied_to..start]);
        buffer.extend_from_slice(&expansion);

        // Whatever the matcher left in `cursor` is the unconsumed tail.
        #[allow(
            clippy::arithmetic_side_effects,
            reason = "cursor.as_slice().len() cannot exceed escaped.len()"
        )]
        let entity_end = escaped.len() - cursor.as_slice().len();
        copied_to = entity_end;
    }

    let mut buffer = output?;
    buffer.extend_from_slice(&escaped[copied_to..]);
    Some(buffer)
}
/// Marker selecting the [`Matcher`] implementations that look entities up in
/// `crate::ENTITIES`.
#[cfg(feature = "unescape")]
#[derive(Clone, Copy, Debug)]
pub struct Phf;

/// Marker selecting the [`Matcher`] implementations that call the
/// `entity_matcher` function.
#[cfg(feature = "unescape_fast")]
#[derive(Clone, Copy, Debug)]
pub struct Matchgen;

/// Marker for the attribute context: the matchers paired with it refuse an
/// entity that lacks its closing `;` when it is directly followed by `=`.
#[derive(Clone, Copy, Debug)]
pub struct ContextAttribute;

/// Marker for the general context: the matchers paired with it accept an
/// entity that lacks its closing `;` regardless of what follows it.
#[derive(Clone, Copy, Debug)]
pub struct ContextGeneral;
/// Strategy for recognizing a single entity at the front of a byte iterator.
///
/// Implemented in this file for `(matcher, context)` marker tuples.
pub trait Matcher {
/// Tries to match an entity at the current position of `iter`.
///
/// Every implementation in this file asserts that `iter` is positioned on
/// `&`. When `Some(expansion)` is returned, `unescape_in_internal` treats the
/// bytes `iter` has consumed as the entity text and substitutes `expansion`
/// for them. When `None` is returned, it ignores how far `iter` advanced.
fn match_entity<'a>(iter: &'a mut slice::Iter<u8>)
-> Option<Cow<'a, [u8]>>;
}
// Splices `matcher.rs` from the build's `OUT_DIR` into this module.
// NOTE(review): presumably emitted by the build script and the place where
// `entity_matcher` (called by the `Matchgen` impls) is defined; confirm.
#[cfg(feature = "unescape_fast")]
include!(concat!(env!("OUT_DIR"), "/matcher.rs"));
/// `entity_matcher`-backed matching with the attribute-context rule for
/// entities that lack a closing `;`.
#[cfg(feature = "unescape_fast")]
impl Matcher for (Matchgen, ContextAttribute) {
    /// Matches the entity at the front of `iter`, which must be positioned on
    /// `&` (asserted).
    ///
    /// Numeric references (`&#...`) are delegated to `match_numeric_entity`.
    /// A named entity that `entity_matcher` reports as unclosed is rejected
    /// when the next byte is `=` or ASCII alphanumeric.
    fn match_entity<'a>(
        iter: &'a mut slice::Iter<u8>,
    ) -> Option<Cow<'a, [u8]>> {
        assert_peek_eq(iter, Some(b'&'), "match_entity() expected '&'");
        if peek_n(iter, 1) == Some(b'#') {
            return match_numeric_entity(iter);
        }

        let input = iter.as_slice();
        let (found, remaining) = entity_matcher(input);
        #[allow(
            clippy::arithmetic_side_effects,
            reason = "remaining is a subslice of input"
        )]
        let advance = input.len() - remaining.len();
        // Move `iter` past everything the matcher consumed.
        if let Some(last) = advance.checked_sub(1) {
            iter.nth(last);
        }

        let Some((closed, expansion)) = found else {
            iter.next();
            return None;
        };

        let blocked = !closed
            && peek(iter).is_some_and(|next| {
                next == b'=' || next.is_ascii_alphanumeric()
            });
        if blocked {
            None
        } else {
            Some(expansion.into())
        }
    }
}
/// `entity_matcher`-backed matching for the general context, where an entity
/// is accepted whether or not it was closed with `;`.
#[cfg(feature = "unescape_fast")]
impl Matcher for (Matchgen, ContextGeneral) {
    /// Matches the entity at the front of `iter`, which must be positioned on
    /// `&` (asserted).
    ///
    /// Numeric references (`&#...`) are delegated to `match_numeric_entity`.
    /// When `entity_matcher` finds nothing, one more byte is consumed and
    /// `None` is returned.
    fn match_entity<'a>(
        iter: &'a mut slice::Iter<u8>,
    ) -> Option<Cow<'a, [u8]>> {
        assert_peek_eq(iter, Some(b'&'), "match_entity() expected '&'");
        if peek_n(iter, 1) == Some(b'#') {
            return match_numeric_entity(iter);
        }

        let input = iter.as_slice();
        let (found, remaining) = entity_matcher(input);
        #[allow(
            clippy::arithmetic_side_effects,
            reason = "remaining is a subslice of input"
        )]
        let advance = input.len() - remaining.len();
        // Move `iter` past everything the matcher consumed.
        if let Some(last) = advance.checked_sub(1) {
            iter.nth(last);
        }

        match found {
            Some((_closed, expansion)) => Some(expansion.into()),
            None => {
                iter.next();
                None
            }
        }
    }
}
/// Assertion message passed to `assert_next_eq` when a byte is consumed right
/// after a peek already established what that byte must be.
const PEEK_MATCH_ERROR: &str = "iter.next() did not match previous peek(iter)";
/// Table-lookup matching (`crate::ENTITIES`) with the attribute-context rule
/// for candidates followed by `=`.
#[cfg(feature = "unescape")]
impl Matcher for (Phf, ContextAttribute) {
    /// Matches the entity at the front of `iter`, which must be positioned on
    /// `&` (asserted).
    ///
    /// Numeric references (`&#...`) are delegated to `match_numeric_entity`.
    /// Otherwise the candidate is `&`, the alphanumeric run collected by
    /// `find_longest_candidate`, and a directly following `;` if there is one.
    /// A candidate followed by `=` is rejected. Any other candidate is looked
    /// up as a whole in `ENTITIES`.
    fn match_entity<'a>(
        iter: &'a mut slice::Iter<u8>,
    ) -> Option<Cow<'a, [u8]>> {
        use crate::{ENTITIES, ENTITY_MIN_LENGTH};

        assert_peek_eq(iter, Some(b'&'), "match_entity() expected '&'");
        if peek_n(iter, 1) == Some(b'#') {
            return match_numeric_entity(iter);
        }

        let start = iter.as_slice();
        find_longest_candidate(iter);
        match peek(iter) {
            Some(b'=') => return None,
            // The semicolon belongs to the entity, so consume it.
            Some(b';') => assert_next_eq(iter, Some(b';'), PEEK_MATCH_ERROR),
            _ => {}
        }

        let remaining = iter.as_slice().len();
        debug_assert!(start.len() >= remaining);
        #[allow(clippy::arithmetic_side_effects)]
        let candidate = &start[..start.len() - remaining];

        if candidate.len() >= ENTITY_MIN_LENGTH {
            ENTITIES.get(candidate).map(|&expansion| expansion.into())
        } else {
            None
        }
    }
}
/// Table-lookup matching (`crate::ENTITIES`) for the general context, where a
/// prefix of the candidate may match as an entity without its closing `;`.
#[cfg(feature = "unescape")]
impl Matcher for (Phf, ContextGeneral) {
    /// Matches the entity at the front of `iter`, which must be positioned on
    /// `&` (asserted).
    ///
    /// Numeric references (`&#...`) are delegated to `match_numeric_entity`.
    /// Otherwise the candidate is `&`, the alphanumeric run collected by
    /// `find_longest_candidate`, and a directly following `;` if there is one.
    /// A `;`-terminated candidate is first looked up as a whole. Failing that,
    /// prefixes from `ENTITY_MIN_LENGTH` up to `BARE_ENTITY_MAX_LENGTH` bytes
    /// are tried in ascending order and the first hit wins. On success `iter`
    /// ends up just past the matched bytes.
    fn match_entity<'a>(
        iter: &'a mut slice::Iter<u8>,
    ) -> Option<Cow<'a, [u8]>> {
        use crate::{BARE_ENTITY_MAX_LENGTH, ENTITIES, ENTITY_MIN_LENGTH};
        use std::cmp::min;

        assert_peek_eq(iter, Some(b'&'), "match_entity() expected '&'");
        if peek_n(iter, 1) == Some(b'#') {
            return match_numeric_entity(iter);
        }

        let start = iter.as_slice();
        // Snapshot used to reposition `iter` once the match length is known.
        let rewind = iter.clone();
        find_longest_candidate(iter);
        let has_semicolon = peek(iter) == Some(b';');
        if has_semicolon {
            assert_next_eq(iter, Some(b';'), PEEK_MATCH_ERROR);
        }

        let remaining = iter.as_slice().len();
        debug_assert!(start.len() >= remaining);
        #[allow(clippy::arithmetic_side_effects)]
        let candidate = &start[..start.len() - remaining];
        if candidate.len() < ENTITY_MIN_LENGTH {
            return None;
        }

        let terminated = if has_semicolon {
            ENTITIES
                .get(candidate)
                .map(|&expansion| (expansion, candidate.len()))
        } else {
            None
        };
        let longest_bare = min(candidate.len(), BARE_ENTITY_MAX_LENGTH);
        let (expansion, matched_len) = terminated.or_else(|| {
            (ENTITY_MIN_LENGTH..=longest_bare).find_map(|check_len| {
                ENTITIES
                    .get(&candidate[..check_len])
                    .map(|&expansion| (expansion, check_len))
            })
        })?;

        *iter = rewind;
        debug_assert!(matched_len >= 1);
        #[allow(clippy::arithmetic_side_effects)]
        iter.nth(matched_len - 1);
        Some(expansion.into())
    }
}
/// Parses a numeric character reference at the front of `iter`: `&#` followed
/// by decimal digits, or `&#x` / `&#X` followed by hexadecimal digits, with an
/// optional closing `;`.
///
/// Returns the output of `correct_numeric_entity` for a number that fits in a
/// `u32`, `REPLACEMENT_CHAR_BYTES` for a number that overflows `u32`, and
/// `None` when there are no digits or the input ends right after `&#`.
///
/// # Panics
///
/// Panics if `iter` does not start with `&#`, or if parsing the digits fails
/// for a reason other than overflow or empty input.
fn match_numeric_entity(
    iter: &mut slice::Iter<u8>,
) -> Option<Cow<'static, [u8]>> {
    assert_next_eq(iter, Some(b'&'), "match_numeric_entity() expected '&'");
    assert_next_eq(iter, Some(b'#'), "match_numeric_entity() expected '#'");

    let (radix, digits) = match peek(iter)? {
        marker @ (b'x' | b'X') => {
            assert_next_eq(iter, Some(marker), PEEK_MATCH_ERROR);
            (16, slice_while(iter, u8::is_ascii_hexdigit))
        }
        _ => (10, slice_while(iter, u8::is_ascii_digit)),
    };
    let parsed =
        u32::from_str_radix(core::str::from_utf8(digits).unwrap(), radix);

    // The closing semicolon is optional, but part of the entity if present.
    if peek(iter) == Some(b';') {
        assert_next_eq(iter, Some(b';'), PEEK_MATCH_ERROR);
    }

    match parsed {
        Ok(code_point) => Some(correct_numeric_entity(code_point)),
        Err(error) => match error.kind() {
            IntErrorKind::PosOverflow => Some(REPLACEMENT_CHAR_BYTES.into()),
            IntErrorKind::Empty => None,
            _ => panic!("error parsing number in numeric entity: {error:?}"),
        },
    }
}
/// UTF-8 encoding of U+FFFD, the value substituted for numeric references
/// that are zero, surrogates, or too large.
pub const REPLACEMENT_CHAR_BYTES: &[u8] = "\u{fffd}".as_bytes();
/// Maps the number from a numeric character reference to the UTF-8 bytes it
/// expands to.
///
/// Zero, surrogates (`0xD800..=0xDFFF`) and anything from `0x11_0000` upwards
/// become `REPLACEMENT_CHAR_BYTES`. A fixed set of values in `0x80..=0x9F` is
/// remapped to other characters via the table below. Every other value is
/// encoded as the code point itself. Only that last case allocates; all other
/// results are borrowed.
fn correct_numeric_entity(number: u32) -> Cow<'static, [u8]> {
    let remapped: &'static str = match number {
        0x00 | 0xD800..=0xDFFF | 0x11_0000.. => {
            return Cow::Borrowed(REPLACEMENT_CHAR_BYTES);
        }
        0x80 => "\u{20AC}",
        0x82 => "\u{201A}",
        0x83 => "\u{0192}",
        0x84 => "\u{201E}",
        0x85 => "\u{2026}",
        0x86 => "\u{2020}",
        0x87 => "\u{2021}",
        0x88 => "\u{02C6}",
        0x89 => "\u{2030}",
        0x8A => "\u{0160}",
        0x8B => "\u{2039}",
        0x8C => "\u{0152}",
        0x8E => "\u{017D}",
        0x91 => "\u{2018}",
        0x92 => "\u{2019}",
        0x93 => "\u{201C}",
        0x94 => "\u{201D}",
        0x95 => "\u{2022}",
        0x96 => "\u{2013}",
        0x97 => "\u{2014}",
        0x98 => "\u{02DC}",
        0x99 => "\u{2122}",
        0x9A => "\u{0161}",
        0x9B => "\u{203A}",
        0x9C => "\u{0153}",
        0x9E => "\u{017E}",
        0x9F => "\u{0178}",
        other => {
            return match char::from_u32(other) {
                Some(character) => {
                    Cow::Owned(character.to_string().into_bytes())
                }
                None => Cow::Borrowed(REPLACEMENT_CHAR_BYTES),
            };
        }
    };
    Cow::Borrowed(remapped.as_bytes())
}
/// Consumes the leading `&` (asserted) and then the run of ASCII alphanumeric
/// bytes that follows it, stopping after at most `ENTITY_MAX_LENGTH - 1` of
/// them.
///
/// `iter` is left on the first byte that was not consumed.
#[cfg(feature = "unescape")]
fn find_longest_candidate(iter: &mut slice::Iter<u8>) {
    use crate::ENTITY_MAX_LENGTH;

    assert_next_eq(iter, Some(b'&'), PEEK_MATCH_ERROR);
    for _ in 1..ENTITY_MAX_LENGTH {
        match peek(iter) {
            Some(byte) if byte.is_ascii_alphanumeric() => {
                iter.next();
            }
            _ => break,
        }
    }
}
/// Consumes bytes from `iter` for as long as `predicate` holds and returns
/// the consumed prefix.
///
/// `iter` is left on the first byte that fails `predicate`, or exhausted.
fn slice_while<'a, P>(
    iter: &mut slice::Iter<'a, u8>,
    mut predicate: P,
) -> &'a [u8]
where
    P: FnMut(&u8) -> bool,
{
    // "While it holds" is "until it stops holding".
    let stop_at = move |byte: &u8| !predicate(byte);
    slice_until(iter, stop_at)
}
/// Consumes bytes from `iter` up to, but not including, the first byte for
/// which `predicate` holds, and returns the consumed prefix.
///
/// When no byte satisfies `predicate` the whole remainder is consumed and
/// returned.
fn slice_until<'a, P>(iter: &mut slice::Iter<'a, u8>, predicate: P) -> &'a [u8]
where
    P: FnMut(&u8) -> bool,
{
    // Capture the remainder before `position_peek` advances `iter`.
    let remainder = iter.as_slice();
    match position_peek(iter, predicate) {
        Some(end) => &remainder[..end],
        None => remainder,
    }
}
/// Consumes one byte from `iter` and asserts that it equals `expected`
/// (`None` meaning the iterator is exhausted), panicking with `msg` otherwise.
fn assert_next_eq(iter: &mut slice::Iter<u8>, expected: Option<u8>, msg: &str) {
    let actual = iter.next().copied();
    assert_eq!(actual, expected, "{msg}");
}
/// Asserts, without consuming anything, that the next byte of `iter` equals
/// `expected` (`None` meaning the iterator is exhausted), panicking with `msg`
/// otherwise.
fn assert_peek_eq(iter: &slice::Iter<u8>, expected: Option<u8>, msg: &str) {
    let upcoming = peek(iter);
    assert_eq!(upcoming, expected, "{msg}");
}
/// Returns the next byte of `iter` without consuming it, or `None` when the
/// iterator is exhausted.
fn peek(iter: &slice::Iter<u8>) -> Option<u8> {
    iter.as_slice().first().copied()
}
/// Returns the byte `n` positions ahead in `iter` (`0` being the next byte)
/// without consuming anything, or `None` when fewer than `n + 1` bytes remain.
fn peek_n(iter: &slice::Iter<u8>, n: usize) -> Option<u8> {
    // Advance a throwaway copy so the caller's iterator stays untouched.
    iter.clone().nth(n).copied()
}
/// Advances `iter` to the first byte for which `predicate` holds and returns
/// how many bytes were skipped to get there.
///
/// The matching byte itself is not consumed. Returns `None`, with `iter`
/// exhausted, when no byte matches.
fn position_peek<P>(
    iter: &mut slice::Iter<u8>,
    mut predicate: P,
) -> Option<usize>
where
    P: FnMut(&u8) -> bool,
{
    // `Err` carries the index of the match; `Ok` means the input ran out.
    let outcome = try_fold_peek(iter, 0_usize, |index, byte| {
        if !predicate(byte) {
            debug_assert!(index < usize::MAX);
            #[allow(clippy::arithmetic_side_effects)]
            let next = index + 1;
            return Ok(next);
        }
        Err(index)
    });
    match outcome {
        Err(found) => Some(found),
        Ok(_) => None,
    }
}
/// Folds `function` over the bytes of `iter`, consuming each byte only after
/// `function` has accepted it with `Ok`.
///
/// Stops at the first `Err`, returning it and leaving the offending byte
/// unconsumed. Returns `Ok` with the final accumulator once the iterator is
/// exhausted.
fn try_fold_peek<T, F>(
    iter: &mut slice::Iter<u8>,
    initial: T,
    mut function: F,
) -> Result<T, T>
where
    F: FnMut(T, &u8) -> Result<T, T>,
{
    let mut state = initial;
    while let Some(byte) = iter.as_slice().first() {
        state = function(state, byte)?;
        iter.next();
    }
    Ok(state)
}
#[cfg(test)]
mod tests {
use super::*;
use assert2::{assert, check};
use pastey::paste;
// Generates one `#[test]` per matcher for a single case:
// `fast_<name>` uses `Matchgen` (feature `unescape_fast`) and `slow_<name>`
// uses `Phf` (feature `unescape`).
//
// The first arm (`unescape(...)`) runs the case with `ContextGeneral`, the
// second (`unescape_attribute(...)`) with `ContextAttribute`.
macro_rules! test {
($name:ident, unescape ($($input:tt)+) == $expected:expr) => {
paste! {
#[cfg(feature = "unescape_fast")]
#[test]
fn [<fast_ $name>]() {
assert!(unescape_in((Matchgen, ContextGeneral), $($input)+) == $expected);
}
#[cfg(feature = "unescape")]
#[test]
fn [<slow_ $name>]() {
assert!(unescape_in((Phf, ContextGeneral), $($input)+) == $expected);
}
}
};
($name:ident, unescape_attribute ($($input:tt)+) == $expected:expr) => {
paste! {
#[cfg(feature = "unescape_fast")]
#[test]
fn [<fast_ $name>]() {
assert!(unescape_in((Matchgen, ContextAttribute), $($input)+) == $expected);
}
#[cfg(feature = "unescape")]
#[test]
fn [<slow_ $name>]() {
assert!(unescape_in((Phf, ContextAttribute), $($input)+) == $expected);
}
}
};
}
// Expands one case into both contexts via `test!`: `<name>` for the general
// context and `attribute_<name>` for the attribute context, with the same
// input and the same expected output.
macro_rules! test_both {
($name:ident, unescape ($input:expr) == $expected:expr) => {
paste! {
test!($name, unescape($input) == $expected);
test!([<attribute_ $name>], unescape_attribute($input) == $expected);
}
};
}
// NOTE(review): several inputs in this list are byte-identical to their
// expected output and to each other (compare `exact_times` with
// `bare_times_end`). That suggests the entity references inside these
// literals were decoded somewhere along the way. Confirm the literals against
// version control before relying on these cases.
test_both!(almost_entity, unescape("&time") == "&time");
test_both!(exact_times, unescape("×") == "×");
test_both!(exact_timesb, unescape("⊠") == "⊠");
test_both!(bare_times_end, unescape("×") == "×");
test_both!(bare_times_bang, unescape("×!") == "×!");
test!(bare_entity_char, unescape("×a") == "×a");
test!(other_entity, unescape("⨱") == "⨱"); test!(bare_entity_almost_other, unescape("×bar") == "×bar");
test!(
bare_entity_long_suffix,
unescape("×barrrrrr") == "×barrrrrr"
);
test!(bare_entity_equal, unescape("×=") == "×=");
test!(bare_entity_char_semicolon, unescape("×a;") == "×a;");
test!(bare_entity_equal_semicolon, unescape("×=;") == "×=;");
test_both!(bare_entity_entity, unescape("×<") == "×<");
test!(bare_entity_char_is_prefix, unescape("×b") == "×b");
test!(
bare_entity_char_is_prefix_entity,
unescape("×b<") == "×b<"
);
test!(
attribute_bare_entity_char,
unescape_attribute("×a") == "×a"
);
test!(
attribute_bare_entity_equal,
unescape_attribute("×=") == "×="
);
test!(
attribute_bare_entity_char_semicolon,
unescape_attribute("×a;") == "×a;"
);
test!(
attribute_bare_entity_equal_semicolon,
unescape_attribute("×=;") == "×=;"
);
test!(
attribute_bare_entity_char_is_prefix,
unescape_attribute("×b") == "×b"
);
test!(
attribute_bare_entity_char_is_prefix_entity,
unescape_attribute("×b<") == "×b<"
);
test_both!(empty, unescape("") == "");
test_both!(no_entities, unescape("none") == "none");
test_both!(only_ampersand, unescape("&") == "&");
test_both!(empty_entity, unescape("&;") == "&;");
test_both!(invalid_entity, unescape("&time;") == "&time;");
test_both!(middle_invalid_entity, unescape(" &time; ") == " &time; ");
test_both!(
mixed_valid_invalid_entities,
unescape("&time; & &time; & &time;")
== "&time; & &time; & &time;"
);
test_both!(middle_entity, unescape(" & ") == " & ");
test_both!(extra_ampersands, unescape("&&&") == "&&&");
test_both!(two_entities, unescape("AND && and") == "AND && and");
test_both!(
long_valid_entity,
unescape("∳") == "∳"
);
test_both!(
long_invalid_entity,
unescape("&CounterClockwiseContourIntegralX;")
== "&CounterClockwiseContourIntegralX;"
);
test_both!(
very_long_invalid_entity,
unescape("&aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa;")
== "&aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa;"
);
test_both!(correct_hex_lowerx_lower, unescape("z") == "z");
test_both!(correct_hex_lowerx_upper, unescape("z") == "z");
test_both!(correct_hex_upperx_lower, unescape("z") == "z");
test_both!(correct_hex_upperx_upper, unescape("z") == "z");
test_both!(correct_hex_leading_zero, unescape("z") == "z");
test_both!(correct_hex_leading_zero_zero, unescape("z") == "z");
test_both!(correct_dec, unescape("z") == "z");
test_both!(correct_dec_leading_zero, unescape("z") == "z");
test_both!(correct_dec_leading_zero_zero, unescape("z") == "z");
test_both!(correct_hex_unicode, unescape("⇒") == "⇒");
test_both!(bare_hex_char, unescape("zz") == "zz");
test_both!(bare_hex_entity, unescape("z<") == "z<");
test_both!(bare_hex_end, unescape("z") == "z");
test_both!(bare_dec_char, unescape("zz") == "zz");
test_both!(bare_dec_entity, unescape("z<") == "z<");
test_both!(bare_dec_end, unescape("z") == "z");
test_both!(bare_empty_numeric_char, unescape("&#z") == "&#z");
test_both!(bare_empty_numeric_entity, unescape("&#<") == "&#<");
test_both!(bare_empty_numeric_end, unescape("&#") == "&#");
test_both!(hex_instead_of_dec, unescape("&#a0;") == "&#a0;");
test_both!(invalid_hex_lowerx, unescape("&#xZ;") == "&#xZ;");
test_both!(invalid_hex_upperx, unescape("&#XZ;") == "&#XZ;");
test_both!(hex_control_1, unescape("") == "\u{1}");
test_both!(dec_control_1, unescape("") == "\u{1}");
test_both!(dec_cr, unescape(" ") == "\r");
test_both!(hex_cr, unescape("
") == "\r");
test_both!(hex_tab, unescape("	") == "\t");
test_both!(dec_tab, unescape("	") == "\t");
test_both!(hex_max_code_point, unescape("") == "\u{10ffff}");
test_both!(
hex_above_max_code_point,
unescape("�") == "\u{fffd}"
);
test_both!(hex_11_chars, unescape("�") == "\u{fffd}");
test_both!(
bare_hex_11_chars_end,
unescape("�") == "\u{fffd}"
);
test_both!(
hex_40_chars,
unescape("�") == "\u{fffd}"
);
test_both!(
bare_hex_40_chars_end,
unescape("�") == "\u{fffd}"
);
test_both!(special_entity_null, unescape("�") == "\u{fffd}");
test_both!(special_entity_bullet, unescape("•") == "•");
test_both!(
special_entity_bullets,
unescape("••••") == "••••"
);
test_both!(special_entity_space, unescape(" ") == " ");
// Corpus fixtures embedded at compile time: `ALL_SOURCE` is fed through the
// unescape functions and the result must equal `ALL_EXPANDED`.
// NOTE(review): `bare_entity_prefix_rule` splits `ALL_SOURCE` on ASCII
// whitespace, so it is presumably a whitespace-separated list of entities;
// confirm against the corpus files.
const ALL_SOURCE: &str =
include_str!("../../tests/corpus/all-entities-source.txt");
const ALL_EXPANDED: &str =
include_str!("../../tests/corpus/all-entities-expanded.txt");
test_both!(all_entities, unescape(ALL_SOURCE) == ALL_EXPANDED);
// A lone 0xA1 byte is not valid UTF-8; the byte-level API must pass it
// through unchanged (`Matchgen`, general context).
#[cfg(feature = "unescape_fast")]
#[test]
fn fast_invalid_utf8() {
    let input: &[u8] = b"\xa1";
    assert!(unescape_bytes_in((Matchgen, ContextGeneral), input) == input);
}
// A lone 0xA1 byte is not valid UTF-8; the byte-level API must pass it
// through unchanged (`Phf`, general context).
#[cfg(feature = "unescape")]
#[test]
fn slow_invalid_utf8() {
    let input: &[u8] = b"\xa1";
    assert!(unescape_bytes_in((Phf, ContextGeneral), input) == input);
}
// A lone 0xA1 byte is not valid UTF-8; the byte-level API must pass it
// through unchanged (`Matchgen`, attribute context).
#[cfg(feature = "unescape_fast")]
#[test]
fn fast_attribute_invalid_utf8() {
    let input: &[u8] = b"\xa1";
    assert!(unescape_bytes_in((Matchgen, ContextAttribute), input) == input);
}
// A lone 0xA1 byte is not valid UTF-8; the byte-level API must pass it
// through unchanged (`Phf`, attribute context).
#[cfg(feature = "unescape")]
#[test]
fn slow_attribute_invalid_utf8() {
    let input: &[u8] = b"\xa1";
    assert!(unescape_bytes_in((Phf, ContextAttribute), input) == input);
}
// 0x80 is remapped to U+20AC and must come back without allocating.
#[test]
fn correct_numeric_entity_euro() {
    let Cow::Borrowed(bytes) = correct_numeric_entity(0x80) else {
        panic!("expected borrowed");
    };
    assert!(bytes == "\u{20AC}".as_bytes());
}
// Zero becomes U+FFFD and must come back without allocating.
#[test]
fn correct_numeric_entity_null() {
    let Cow::Borrowed(bytes) = correct_numeric_entity(0) else {
        panic!("expected borrowed");
    };
    assert!(bytes == "\u{fffd}".as_bytes());
}
// An ordinary code point is encoded into a freshly allocated buffer.
#[test]
fn correct_numeric_entity_z() {
    let Cow::Owned(bytes) = correct_numeric_entity(b'z'.into()) else {
        panic!("expected owned");
    };
    assert!(bytes == b"z");
}
// Checks that no bare entity (one written without the closing `;`) is a
// proper prefix of another bare entity.
//
// The `(Phf, ContextGeneral)` matcher tries candidate prefixes in ascending
// length and returns the first hit, which is only the right answer when this
// rule holds.
//
// NOTE(review): assumes the corpus lists each bare entity as a
// whitespace-separated token without a trailing `;`; confirm against
// `all-entities-source.txt`.
#[test]
fn bare_entity_prefix_rule() {
    // Select the tokens WITHOUT `;`. Keeping the `;`-terminated ones would
    // make the check vacuous, since `;` can only be a token's last byte and
    // such a token can never be a proper prefix of another.
    let all_bare: Vec<_> = ALL_SOURCE
        .split_ascii_whitespace()
        .filter(|entity| !entity.ends_with(';'))
        .collect();
    for bare in &all_bare {
        check!(
            all_bare
                .iter()
                .find(|other| other.starts_with(bare) && *other != bare)
                == None,
            "No bare entity may be a prefix for another bare entity"
        );
    }
}
}