use std::{collections::HashMap, sync::LazyLock};
use regex::{Captures, Regex};
use crate::html::entities::HTML5_ENTITIES_REF;
/// Numeric character references that are decoded to a fixed replacement string
/// instead of the code point they literally name.
///
/// Each entry is `(referenced code point, replacement text)`. NUL becomes the
/// replacement character, carriage return is kept, and `0x80..=0x9f` follow the
/// Windows-1252 layout (the unassigned slots keep their C1 control character).
static INVALID_CHAR: [(u32, &str); 34] = [
    (0x00, "\u{fffd}"), // REPLACEMENT CHARACTER
    (0x0d, "\r"),       // CARRIAGE RETURN
    (0x80, "\u{20ac}"), // EURO SIGN
    (0x81, "\u{81}"),   // <control>
    (0x82, "\u{201a}"), // SINGLE LOW-9 QUOTATION MARK
    (0x83, "\u{0192}"), // LATIN SMALL LETTER F WITH HOOK
    (0x84, "\u{201e}"), // DOUBLE LOW-9 QUOTATION MARK
    (0x85, "\u{2026}"), // HORIZONTAL ELLIPSIS
    (0x86, "\u{2020}"), // DAGGER
    (0x87, "\u{2021}"), // DOUBLE DAGGER
    (0x88, "\u{02c6}"), // MODIFIER LETTER CIRCUMFLEX ACCENT
    (0x89, "\u{2030}"), // PER MILLE SIGN
    (0x8a, "\u{0160}"), // LATIN CAPITAL LETTER S WITH CARON
    (0x8b, "\u{2039}"), // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    (0x8c, "\u{0152}"), // LATIN CAPITAL LIGATURE OE
    (0x8d, "\u{8d}"),   // <control>
    (0x8e, "\u{017d}"), // LATIN CAPITAL LETTER Z WITH CARON
    (0x8f, "\u{8f}"),   // <control>
    (0x90, "\u{90}"),   // <control>
    (0x91, "\u{2018}"), // LEFT SINGLE QUOTATION MARK
    (0x92, "\u{2019}"), // RIGHT SINGLE QUOTATION MARK
    (0x93, "\u{201c}"), // LEFT DOUBLE QUOTATION MARK
    (0x94, "\u{201d}"), // RIGHT DOUBLE QUOTATION MARK
    (0x95, "\u{2022}"), // BULLET
    (0x96, "\u{2013}"), // EN DASH
    (0x97, "\u{2014}"), // EM DASH
    (0x98, "\u{02dc}"), // SMALL TILDE
    (0x99, "\u{2122}"), // TRADE MARK SIGN
    (0x9a, "\u{0161}"), // LATIN SMALL LETTER S WITH CARON
    (0x9b, "\u{203a}"), // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    (0x9c, "\u{0153}"), // LATIN SMALL LIGATURE OE
    (0x9d, "\u{9d}"),   // <control>
    (0x9e, "\u{017e}"), // LATIN SMALL LETTER Z WITH CARON
    (0x9f, "\u{0178}"), // LATIN CAPITAL LETTER Y WITH DIAERESIS
];
/// Lazily built lookup map over [`INVALID_CHAR`]: referenced code point to the
/// replacement text that must be emitted for it.
static INVALID_CHAR_REF: LazyLock<HashMap<u32, &'static str>> =
    LazyLock::new(|| HashMap::from(INVALID_CHAR));
/// Code points that a numeric character reference may name but that are
/// decoded to the empty string.
static INVALID_CODEPOINTS: [u32; 126] = [
    // 0x0001 to 0x0008
    0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
    // 0x000E to 0x001F
    0xe, 0xf,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    // 0x007F to 0x009F
    0x7f,
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    // 0xFDD0 to 0xFDEF
    0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7,
    0xfdd8, 0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf,
    0xfde0, 0xfde1, 0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7,
    0xfde8, 0xfde9, 0xfdea, 0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef,
    // 0x000B
    0xb,
    // The last two code points of every plane
    0xfffe, 0xffff,
    0x1fffe, 0x1ffff,
    0x2fffe, 0x2ffff,
    0x3fffe, 0x3ffff,
    0x4fffe, 0x4ffff,
    0x5fffe, 0x5ffff,
    0x6fffe, 0x6ffff,
    0x7fffe, 0x7ffff,
    0x8fffe, 0x8ffff,
    0x9fffe, 0x9ffff,
    0xafffe, 0xaffff,
    0xbfffe, 0xbffff,
    0xcfffe, 0xcffff,
    0xdfffe, 0xdffff,
    0xefffe, 0xeffff,
    0xffffe, 0xfffff,
    0x10fffe, 0x10ffff,
];
/// Matches one candidate character reference; capture group 1 is everything
/// after the leading `&`, including the optional trailing `;`.
///
/// The three alternatives are, in order: decimal (`#123`), hexadecimal
/// (`#x7B` / `#X7B`) and named (up to 32 characters that are not whitespace,
/// `<`, `&`, `#` or `;`).
///
/// Digits are spelled `[0-9]` rather than `\d` on purpose: the `regex` crate's
/// `\d` is Unicode-aware, so it would accept digits from other scripts that
/// `u32` parsing rejects, and such input would then be replaced by U+FFFD
/// instead of being left untouched.
static CHAR_REF: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(concat!(
        r"&(#[0-9]+;?",
        r"|#[xX][0-9a-fA-F]+;?",
        r"|[^\t\n\f <&#;]{1,32};?)",
    ))
    .expect("CHAR_REF is a hard-coded, valid pattern")
});
/// Converts every named and numeric character reference in `text` (for example
/// `&gt;`, `&#62;`, `&#x3e;`) to the corresponding Unicode text.
///
/// Candidates are located with [`CHAR_REF`]; anything that is not recognised as
/// a reference is copied to the output unchanged. The trailing `;` is optional.
///
/// Returns a newly allocated `String`, even when `text` contains no reference.
pub fn html_unescape(text: &str) -> String {
    // Fast path: without an ampersand there is nothing to decode.
    if !text.contains('&') {
        return text.to_string();
    }
    CHAR_REF
        .replace_all(text, |caps: &Captures| {
            let reference = &caps[1];
            match reference.strip_prefix('#') {
                Some(number) => unescape_numeric_char_ref(number),
                None => unescape_named_char_ref(reference),
            }
        })
        .into_owned()
}

/// Decodes the body of a numeric character reference, i.e. the text that
/// follows `&#` (such as `65;`, `x41` or `X41;`).
fn unescape_numeric_char_ref(number: &str) -> String {
    const REPLACEMENT: &str = "\u{FFFD}";

    let number = number.trim_end_matches(';');
    let parsed = match number.strip_prefix(['x', 'X']) {
        Some(hex) => u32::from_str_radix(hex, 16),
        None => number.parse::<u32>(),
    };
    // A parse failure means the value does not fit in a `u32`, which is far
    // beyond the Unicode range anyway.
    let Ok(code_point) = parsed else {
        return REPLACEMENT.to_string();
    };
    if let Some(replacement) = INVALID_CHAR_REF.get(&code_point) {
        return replacement.to_string();
    }
    if INVALID_CODEPOINTS.contains(&code_point) {
        return String::new();
    }
    // `char::from_u32` is `None` exactly for surrogates (0xD800..=0xDFFF) and
    // values above 0x10FFFF; both decode to the replacement character.
    char::from_u32(code_point).map_or_else(|| REPLACEMENT.to_string(), String::from)
}

/// Decodes a named character reference candidate (the text that follows `&`,
/// optionally ending in `;`).
///
/// When the whole candidate is not a known entity, the longest proper prefix
/// that is one gets decoded and the remainder is kept verbatim (`notit;`
/// becomes `¬it;`). When nothing matches, the original `&name` is returned.
fn unescape_named_char_ref(name: &str) -> String {
    if let Some(entity) = HTML5_ENTITIES_REF.get(name) {
        return entity.to_string();
    }
    (1..name.len())
        .rev()
        // The candidate may contain multi-byte characters; splitting inside
        // one of them would panic, so only char boundaries are considered.
        .filter(|&end| name.is_char_boundary(end))
        .find_map(|end| {
            let (prefix, rest) = name.split_at(end);
            HTML5_ENTITIES_REF
                .get(prefix)
                .map(|entity| format!("{entity}{rest}"))
        })
        .unwrap_or_else(|| format!("&{name}"))
}
#[cfg(test)]
mod tests {
    use super::html_unescape;

    /// Exercises `html_unescape` on plain text, malformed references, numeric
    /// references in every supported spelling, and named references.
    #[test]
    fn test_html_unescape() {
        fn check(text: &str, expected: &str) {
            assert_eq!(html_unescape(text), expected, "input: {text:?}");
        }

        // `u64` so that the out-of-range value below is a valid literal on
        // every target, including 32-bit ones.
        fn check_num(num: u64, expected: &str) {
            for text in [
                format!("&#{num}"),
                format!("&#{num};"),
                format!("&#x{num:x}"),
                format!("&#x{num:x};"),
            ] {
                check(&text, expected);
            }
        }

        check("Hurl&rlarr;", "Hurl⇄");
        check(
            "Foo &#xA9; bar &#x1D306; baz &#x2603; qux",
            "Foo © bar 𝌆 baz ☃ qux",
        );

        // Text with no character references.
        check("no character references", "no character references");
        // `&` followed by characters that cannot start a reference.
        check("&\n&\t& &&", "&\n&\t& &&");
        // `&` followed by digits and letters that are not entities.
        check("&0 &9 &a &0; &9; &a;", "&0 &9 &a &0; &9; &a;");
        // Incomplete references at the end of the input.
        for x in ["&", "&#", "&#x", "&#X", "&#y", "&#xy", "&#Xy"] {
            check(x, x);
            check(&format!("{x};"), &format!("{x};"));
        }

        // Every spelling of a numeric reference, possibly followed by other
        // characters.
        let formats: [fn(u32) -> String; 14] = [
            |n: u32| format!("&#{n}"),
            |n: u32| format!("&#{n:07}"),
            |n: u32| format!("&#{n};"),
            |n: u32| format!("&#{n:07};"),
            |n: u32| format!("&#x{n:x}"),
            |n: u32| format!("&#x{n:06x}"),
            |n: u32| format!("&#x{n:x};"),
            |n: u32| format!("&#x{n:06x};"),
            |n: u32| format!("&#x{n:X}"),
            |n: u32| format!("&#x{n:06X}"),
            |n: u32| format!("&#x{n:X};"),
            |n: u32| format!("&#x{n:06X};"),
            |n: u32| format!("&#X{n:x};"),
            |n: u32| format!("&#X{n:06x};"),
        ];
        let cases: [(u32, char); 6] = [
            (65, 'A'),
            (97, 'a'),
            (34, '"'),
            (38, '&'),
            (0x2603, '\u{2603}'),
            (0x101234, '\u{101234}'),
        ];
        for render in formats {
            for (num, ch) in cases {
                let reference = render(num);
                check(&reference, &ch.to_string());
                check(&format!("{reference} "), &format!("{ch} "));
                check(&format!("{reference}X"), &format!("{ch}X"));
            }
        }

        // Surrogates and values past the Unicode range.
        for cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000] {
            check_num(cp, "\u{FFFD}");
        }
        // Code points that are dropped.
        for cp in [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff] {
            check_num(cp, "");
        }
        // Code points that are remapped.
        for (num, ch) in [(0x0d, "\r"), (0x80, "\u{20ac}"), (0x95, "\u{2022}")] {
            check_num(num, ch);
        }
        // Small numbers.
        check_num(0, "\u{FFFD}");
        check_num(9, "\t");
        // A number that does not fit in 32 bits.
        check_num(1_000_000_000_000_000_000, "\u{FFFD}");

        // Additional trailing semicolons are plain text.
        for e in ["&quot;;", "&#34;;", "&#x22;;", "&#X22;;"] {
            check(e, "\";");
        }
        // Semicolons in the middle do not create problems.
        for e in ["&quot;quot;", "&#34;quot;", "&#x22;quot;", "&#X22;quot;"] {
            check(e, "\"quot;");
        }
        // Three adjacent references.
        for e in ["&quot", "&#34", "&#x22", "&#X22"] {
            check(&format!("{e};").repeat(3), "\"\"\"");
        }
        // Entity names are case sensitive.
        for e in ["&amp", "&amp;", "&AMP", "&AMP;"] {
            check(e, "&");
        }
        for e in ["&Amp", "&Amp;"] {
            check(e, e);
        }
        // Unknown named references are returned unchanged.
        check("&svadilfari;", "&svadilfari;");
        // Longest-prefix matching.
        check("&notit", "¬it");
        check("&notit;", "¬it;");
        check("&notin", "¬in");
        check("&notin;", "∉");
        check(
            "&notReallyAnExistingNamedCharacterReference;",
            "¬ReallyAnExistingNamedCharacterReference;",
        );
        // Longest valid name.
        check("&CounterClockwiseContourIntegral;", "∳");
        // A reference that maps to two Unicode characters.
        check("&acE;", "\u{223e}\u{333}");
        check("&acE", "&acE");
        // Many references in one input.
        check(&"&#123; ".repeat(1050), &"{ ".repeat(1050));
        check(
            "&Eacute;ric&Eacute;ric&alphacentauri&alpha;centauri",
            "ÉricÉric&alphacentauriαcentauri",
        );
        check("&co;", "&co;");
    }
}