use crate::proc::Processor;
use minify_html_common::gen::codepoints::Lookup;
use minify_html_common::gen::codepoints::ALPHANUMERIC_OR_EQUALS;
use minify_html_common::gen::codepoints::DIGIT;
use minify_html_common::gen::codepoints::HEX_DIGIT;
use minify_html_common::gen::codepoints::LOWER_HEX_ALPHA;
use minify_html_common::gen::codepoints::UPPER_HEX_ALPHA;
use minify_html_common::gen::entities::EntityType;
use minify_html_common::gen::entities::ENTITY;
use minify_html_common::pattern::TrieNodeMatch;
use std::char::from_u32;
/// Outcome of trying to parse one entity-like sequence that starts with `&`.
enum Parsed {
/// The entity was decoded and its bytes were written into the buffer: `read_len` source
/// bytes were consumed and `write_len` decoded bytes were produced. Numeric entities that
/// fall back to U+FFFD are reported through this variant as well.
Decoded { read_len: usize, write_len: usize },
/// The sequence matched a known named entity but was deliberately not decoded; nothing
/// was written to the buffer.
LeftEncoded,
/// The sequence did not match the `ENTITY` trie; `len` is the `reached` count reported by
/// the failed trie lookup.
Invalid { len: usize },
}
/// Decodes the numeric character reference that starts at `code[read_start]` and writes the
/// resulting character, UTF-8 encoded, at `code[write_pos..]`.
///
/// * `prefix_len` - number of bytes between `read_start` and the first digit.
/// * `digit_lookup` - membership table deciding which bytes count as digits.
/// * `on_digit` - folds one digit byte into the accumulated value.
/// * `max_digits` - maximum number of significant digits (leading zeros excluded) for the
///   reference to be considered well formed.
///
/// Always returns `Parsed::Decoded`. References that are malformed (too many digits, not a
/// Unicode scalar value, or the null code point) decode to U+FFFD.
///
/// NOTE(review): the HTML standard also remaps some code points in 0x80..=0x9F to their
/// Windows-1252 equivalents; that remapping is not performed here.
#[inline(always)]
fn parse_numeric_entity(
    code: &mut [u8],
    read_start: usize,
    prefix_len: usize,
    write_pos: usize,
    digit_lookup: &'static Lookup,
    on_digit: fn(u32, u8) -> u32,
    max_digits: usize,
) -> Parsed {
    let mut value = 0u32;
    let mut digits = 0usize;
    let mut read_next = read_start + prefix_len;

    // Leading zeros carry no value and do not count towards `max_digits`.
    while code.get(read_next) == Some(&b'0') {
        read_next += 1;
    }

    // Consume every remaining digit, even past `max_digits`. Overflow is harmless because
    // `on_digit` is expected to wrap and an over-long reference is rejected below anyway.
    while let Some(&c) = code.get(read_next) {
        if !digit_lookup[c] {
            break;
        }
        value = on_digit(value, c);
        read_next += 1;
        digits += 1;
    }

    // A trailing semicolon is consumed when present but is not required.
    if code.get(read_next) == Some(&b';') {
        read_next += 1;
    }

    // A null character reference must become U+FFFD rather than a raw NUL byte; the same
    // fallback covers over-long references and values `from_u32` rejects (surrogates and
    // anything above U+10FFFF).
    let decoded = Some(value)
        .filter(|&v| v != 0 && digits <= max_digits)
        .and_then(from_u32)
        .unwrap_or('\u{FFFD}');

    let write_len = decoded.encode_utf8(&mut code[write_pos..]).len();
    Parsed::Decoded {
        read_len: read_next - read_start,
        write_len,
    }
}
/// Parses the entity-like sequence starting at `code[read_pos]` and, when it is decoded,
/// writes the decoded bytes at `code[write_pos..]`.
///
/// Returns:
/// * `Parsed::Invalid` when the `ENTITY` trie finds no match (nothing is written);
/// * `Parsed::LeftEncoded` when a named entity matched but is kept as-is (nothing is written);
/// * `Parsed::Decoded` otherwise, with the number of bytes read and written.
fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize, in_attr_val: bool) -> Parsed {
    /// Folds one decimal digit byte into the accumulated value, wrapping on overflow.
    fn push_dec_digit(acc: u32, c: u8) -> u32 {
        acc.wrapping_mul(10).wrapping_add((c - b'0') as u32)
    }

    /// Folds one hexadecimal digit byte (either case) into the accumulated value, wrapping
    /// on overflow.
    fn push_hex_digit(acc: u32, c: u8) -> u32 {
        let shifted = acc.wrapping_mul(16);
        let nibble = if DIGIT[c] {
            (c - b'0') as u32
        } else if LOWER_HEX_ALPHA[c] {
            10 + (c - b'a') as u32
        } else if UPPER_HEX_ALPHA[c] {
            10 + (c - b'A') as u32
        } else {
            unreachable!()
        };
        shifted.wrapping_add(nibble)
    }

    let (match_len, kind) = match ENTITY.longest_matching_prefix(&code[read_pos..]) {
        TrieNodeMatch::Found { len, value } => (len, value),
        // Nothing in the trie matched: report how far the lookup got.
        TrieNodeMatch::NotFound { reached } => return Parsed::Invalid { len: reached },
    };

    match kind {
        // Digits start 2 bytes in (presumably right after `&#`); at most 7 significant digits.
        EntityType::Dec => {
            parse_numeric_entity(code, read_pos, 2, write_pos, DIGIT, push_dec_digit, 7)
        }
        // Digits start 3 bytes in (presumably right after `&#x`); at most 6 significant digits.
        EntityType::Hex => {
            parse_numeric_entity(code, read_pos, 3, write_pos, HEX_DIGIT, push_hex_digit, 6)
        }
        EntityType::Named(decoded) => {
            // Inside an attribute value, a match that does not end in `;` and is directly
            // followed by an alphanumeric byte or `=` is not decoded. Evaluated lazily so
            // the buffer is only inspected when the length check has not already decided.
            let kept_by_attr_rule = || {
                in_attr_val
                    && code[read_pos + match_len - 1] != b';'
                    && code
                        .get(read_pos + match_len)
                        .map_or(false, |&next| ALPHANUMERIC_OR_EQUALS[next])
            };

            // Decoding must never grow the text, so longer replacements stay encoded.
            if decoded.len() > match_len || kept_by_attr_rule() {
                return Parsed::LeftEncoded;
            }

            let write_end = write_pos + decoded.len();
            code[write_pos..write_end].copy_from_slice(decoded);
            Parsed::Decoded {
                read_len: match_len,
                write_len: decoded.len(),
            }
        }
    }
}
/// Normalises, in place, the entity at the processor's current read position.
///
/// Returns `false` and changes nothing when the next byte is not `&`. Otherwise the entity
/// (plus any following bytes needed to decide whether the decoded text would itself spell
/// an entity) is decoded, the result is shifted so that it ends exactly where reading
/// stopped, `proc.read_next` is moved to the start of the shifted result, and `true` is
/// returned.
///
/// `in_attr_val` is forwarded to `parse_entity`, which applies the attribute-value rule for
/// named entities.
pub fn maybe_normalise_entity(proc: &mut Processor, in_attr_val: bool) -> bool {
    // Encoded form of a bare ampersand (`&` followed by `amp`), spelled byte by byte so the
    // length is fixed by the type and matches the destination slice below.
    const ENCODED_AMP: [u8; 4] = [b'&', b'a', b'm', b'p'];

    if proc.peek(0).filter(|c| *c == b'&').is_none() {
        return false;
    };

    let start = proc.read_next;
    // `read_next` walks the source bytes; `write_next` is where decoded output is written.
    // Both begin at `start`.
    let mut read_next = start;
    let mut write_next = start;
    // Position in the `ENTITY` trie reached by the *decoded* output so far. Once it carries
    // a value, the output itself spells something the trie recognises.
    let mut node = ENTITY;
    while node.value.is_none() {
        match proc.code.get(read_next) {
            None => break,
            Some(b'&') => {
                // Decode first, then check whether the decoded bytes continue the match.
                let (read_len, write_len) =
                    match parse_entity(proc.code, read_next, write_next, in_attr_val) {
                        // An entity that is intentionally kept encoded ends the look-ahead.
                        Parsed::LeftEncoded => break,
                        Parsed::Decoded {
                            read_len,
                            write_len,
                        } => {
                            debug_assert!(read_len > 0);
                            debug_assert!(write_len > 0);
                            (read_len, write_len)
                        }
                        Parsed::Invalid { len } => {
                            debug_assert!(len > 0);
                            // Only the very first sequence may be consumed when invalid; a
                            // later one is left for a subsequent call to handle.
                            if read_next != start {
                                break;
                            };
                            proc
                                .code
                                .copy_within(read_next..read_next + len, write_next);
                            (len, len)
                        }
                    };
                debug_assert!(read_len > 0);

                let (new_node, match_len) = node
                    .shortest_matching_prefix(&proc.code[write_next..write_next + write_len], 0);
                node = new_node;
                read_next += read_len;
                write_next += write_len;
                // Stop when not all of the decoded bytes were matched by the trie walk.
                if match_len < write_len {
                    break;
                };
            }
            Some(_) => {
                // Plain bytes: advance the trie as far as they match and copy them down.
                let (new_node, new_read_next) =
                    node.shortest_matching_prefix(proc.code, read_next);
                let len = new_read_next - read_next;
                if len == 0 {
                    break;
                };
                proc.code.copy_within(read_next..new_read_next, write_next);
                read_next += len;
                write_next += len;
                node = new_node;
            }
        };
    }

    // When the decoded output reached a trie node with a value, its leading `&` has to be
    // re-encoded so the output is not read back as an entity.
    let undecodable = node.value.is_some();
    // Shift the decoded bytes (minus the leading `&` when it is being re-encoded) so that
    // they end at `read_next` (exclusive).
    let mut shifted_start = read_next - (write_next - start - undecodable as usize);
    proc
        .code
        .copy_within(start + undecodable as usize..write_next, shifted_start);
    if undecodable {
        debug_assert_eq!(proc.code.get(start), Some(&b'&'));
        // Source and destination must be the same length or `copy_from_slice` panics, so
        // the destination is sized from the constant itself.
        let amp_start = shifted_start - ENCODED_AMP.len();
        proc.code[amp_start..shifted_start].copy_from_slice(&ENCODED_AMP);
        shifted_start = amp_start;
    };

    proc.read_next = shifted_start;
    true
}