use memchr::memchr;
use minify_html_common::gen::codepoints::Lookup;
use minify_html_common::gen::codepoints::ALPHANUMERIC_OR_EQUALS;
use minify_html_common::gen::codepoints::DIGIT;
use minify_html_common::gen::codepoints::HEX_DIGIT;
use minify_html_common::gen::codepoints::LOWER_HEX_ALPHA;
use minify_html_common::gen::codepoints::UPPER_HEX_ALPHA;
use minify_html_common::gen::entities::EntityType;
use minify_html_common::gen::entities::ENTITY;
use minify_html_common::pattern::TrieNodeMatch;
use std::char::from_u32;
/// Outcome of attempting to decode a single character reference.
enum Decoded {
/// The reference is not decoded; `decode_entities` copies the consumed
/// source bytes to the output unchanged.
Ignored,
/// A named reference; the payload is the replacement byte sequence that is
/// appended to the output in place of the reference.
Named(&'static [u8]),
/// A numeric reference; the payload is the resulting character, which is
/// UTF-8 encoded into the output.
Numeric(char),
}
/// Result of parsing one character reference at the start of a byte slice.
struct ParsedEntity {
/// What the reference decoded to (or that it must be passed through as-is).
decoded: Decoded,
/// Number of bytes, counted from the start of the parsed slice, that the
/// reference occupies; the caller advances its input by this amount.
read_len: usize,
}
/// Maps the numeric value of a character reference to the character an HTML
/// parser produces for it.
///
/// Implements the fix-ups of the HTML "numeric character reference end state":
/// - `0` (NUL) becomes U+FFFD;
/// - surrogates and values above U+10FFFF become U+FFFD (`from_u32` rejects
///   both);
/// - values in `0x80..=0x9F` that have a Windows-1252 meaning are remapped to
///   the corresponding Unicode character (e.g. `0x80` becomes U+20AC `€`).
fn numeric_reference_to_char(value: u32) -> char {
    let code_point = match value {
        0x00 => return '\u{FFFD}',
        0x80 => 0x20AC,
        0x82 => 0x201A,
        0x83 => 0x0192,
        0x84 => 0x201E,
        0x85 => 0x2026,
        0x86 => 0x2020,
        0x87 => 0x2021,
        0x88 => 0x02C6,
        0x89 => 0x2030,
        0x8A => 0x0160,
        0x8B => 0x2039,
        0x8C => 0x0152,
        0x8E => 0x017D,
        0x91 => 0x2018,
        0x92 => 0x2019,
        0x93 => 0x201C,
        0x94 => 0x201D,
        0x95 => 0x2022,
        0x96 => 0x2013,
        0x97 => 0x2014,
        0x98 => 0x02DC,
        0x99 => 0x2122,
        0x9A => 0x0161,
        0x9B => 0x203A,
        0x9C => 0x0153,
        0x9E => 0x017E,
        0x9F => 0x0178,
        other => other,
    };
    from_u32(code_point).unwrap_or('\u{FFFD}')
}

/// Parses the digits of a numeric character reference and decodes them.
///
/// - `code`: the input, starting at the beginning of the reference.
/// - `read_start`: index in `code` of the first digit. It is passed as an
///   index (rather than pre-slicing `code`) so that the returned `read_len`
///   is measured from the start of `code`.
/// - `digit_lookup`: table answering whether a byte is a valid digit.
/// - `on_digit`: folds one more digit byte into the accumulated value.
/// - `max_digits`: maximum number of significant (non-leading-zero) digits;
///   longer references decode to U+FFFD.
///
/// Always returns `Decoded::Numeric`; malformed or out-of-range values yield
/// U+FFFD. A trailing `;` is consumed if present but is not required.
fn parse_numeric_entity(
    code: &[u8],
    read_start: usize,
    digit_lookup: &'static Lookup,
    on_digit: fn(u32, u8) -> u32,
    max_digits: usize,
) -> ParsedEntity {
    let mut value = 0u32;
    let mut digits = 0usize;
    let mut read_next = read_start;

    // Leading zeros are consumed but do not count towards `max_digits`.
    while code.get(read_next) == Some(&b'0') {
        read_next += 1;
    }

    // Keep consuming digits even past `max_digits`: the whole run belongs to
    // the reference. Whatever `on_digit` accumulated for an over-long run is
    // discarded below, so its value does not matter.
    while let Some(&c) = code.get(read_next) {
        if !digit_lookup[c] {
            break;
        }
        value = on_digit(value, c);
        read_next += 1;
        digits += 1;
    }

    // The terminating semicolon is optional.
    if code.get(read_next) == Some(&b';') {
        read_next += 1;
    }

    let decoded_char = if digits <= max_digits {
        numeric_reference_to_char(value)
    } else {
        '\u{FFFD}'
    };

    ParsedEntity {
        read_len: read_next,
        decoded: Decoded::Numeric(decoded_char),
    }
}
/// Parses the character reference at the start of `code`.
///
/// `code` is matched against the `ENTITY` trie:
/// - no match: the bytes the trie walked over are reported as
///   `Decoded::Ignored`, so they are passed through untouched;
/// - decimal / hexadecimal match: the digits are handed to
///   `parse_numeric_entity`;
/// - named match: the replacement bytes are returned, unless `in_attr_val`
///   is set, the matched name does not end with `;`, and the next byte is in
///   `ALPHANUMERIC_OR_EQUALS` — in that case the reference is left undecoded.
fn parse_entity(code: &[u8], in_attr_val: bool) -> ParsedEntity {
    // Plain functions (rather than closures) for folding a digit into the
    // running value; both wrap on overflow.
    fn push_dec_digit(acc: u32, c: u8) -> u32 {
        acc.wrapping_mul(10).wrapping_add((c - b'0') as u32)
    }

    fn push_hex_digit(acc: u32, c: u8) -> u32 {
        acc.wrapping_mul(16).wrapping_add(if DIGIT[c] {
            (c - b'0') as u32
        } else if LOWER_HEX_ALPHA[c] {
            10 + (c - b'a') as u32
        } else if UPPER_HEX_ALPHA[c] {
            10 + (c - b'A') as u32
        } else {
            unreachable!()
        })
    }

    let (match_len, entity) = match ENTITY.longest_matching_prefix(code) {
        TrieNodeMatch::Found { len, value } => (len, value),
        TrieNodeMatch::NotFound { reached } => {
            return ParsedEntity {
                read_len: reached,
                decoded: Decoded::Ignored,
            };
        }
    };

    match entity {
        // Digits start at index 2 (presumably just past `&#`).
        EntityType::Dec => parse_numeric_entity(code, 2, DIGIT, push_dec_digit, 7),
        // Digits start at index 3 (presumably just past `&#x`).
        EntityType::Hex => parse_numeric_entity(code, 3, HEX_DIGIT, push_hex_digit, 6),
        EntityType::Named(replacement) => {
            // Evaluated left to right with short-circuiting: the byte checks
            // only run when we are inside an attribute value.
            let leave_undecoded = in_attr_val
                && code[match_len - 1] != b';'
                && code
                    .get(match_len)
                    .map_or(false, |&next| ALPHANUMERIC_OR_EQUALS[next]);
            ParsedEntity {
                read_len: match_len,
                decoded: if leave_undecoded {
                    Decoded::Ignored
                } else {
                    Decoded::Named(replacement)
                },
            }
        }
    }
}
/// Returns a copy of `code` in which every character reference has been
/// replaced by what it decodes to.
///
/// Bytes outside references are copied verbatim. References reported as
/// `Decoded::Ignored` by `parse_entity` are copied through unchanged as well.
/// `in_attr_val` is forwarded to `parse_entity`.
pub fn decode_entities(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
    let mut out = Vec::<u8>::new();
    while !code.is_empty() {
        let amp_pos = match memchr(b'&', code) {
            Some(pos) => pos,
            None => {
                // No further references: the remainder is plain text.
                out.extend_from_slice(code);
                break;
            }
        };

        // Everything before the ampersand is plain text.
        let (plain, rest) = code.split_at(amp_pos);
        out.extend_from_slice(plain);

        let ParsedEntity { decoded, read_len } = parse_entity(rest, in_attr_val);
        let (consumed, remaining) = rest.split_at(read_len);
        match decoded {
            Decoded::Ignored => out.extend_from_slice(consumed),
            Decoded::Named(bytes) => out.extend_from_slice(bytes),
            Decoded::Numeric(ch) => {
                let mut utf8 = [0u8; 4];
                out.extend_from_slice(ch.encode_utf8(&mut utf8).as_bytes());
            }
        }
        code = remaining;
    }
    out
}