minify_html_onepass/proc/
entity.rs1use crate::proc::Processor;
17use minify_html_common::gen::codepoints::Lookup;
18use minify_html_common::gen::codepoints::ALPHANUMERIC_OR_EQUALS;
19use minify_html_common::gen::codepoints::DIGIT;
20use minify_html_common::gen::codepoints::HEX_DIGIT;
21use minify_html_common::gen::codepoints::LOWER_HEX_ALPHA;
22use minify_html_common::gen::codepoints::UPPER_HEX_ALPHA;
23use minify_html_common::gen::entities::EntityType;
24use minify_html_common::gen::entities::ENTITY;
25use minify_html_common::pattern::TrieNodeMatch;
26use std::char::from_u32;
27
/// Outcome of trying to parse a single `&`-introduced sequence.
enum Parsed {
    /// The sequence was decoded in place: `read_len` input bytes were consumed
    /// and `write_len` output bytes were written.
    Decoded {
        read_len: usize,
        write_len: usize,
    },
    /// A named entity was recognised but intentionally not decoded; nothing
    /// was written.
    LeftEncoded,
    /// No entity matched; `len` is how many bytes the entity lookup reached
    /// before giving up.
    Invalid {
        len: usize,
    },
}
38
39#[inline(always)]
40fn parse_numeric_entity(
41 code: &mut [u8],
42 read_start: usize,
43 prefix_len: usize,
44 write_pos: usize,
45 digit_lookup: &'static Lookup,
46 on_digit: fn(u32, u8) -> u32,
47 max_digits: usize,
48) -> Parsed {
49 let mut value = 0u32;
50 let mut digits = 0;
51 let mut read_next = read_start + prefix_len;
52 while code.get(read_next).filter(|c| **c == b'0').is_some() {
54 read_next += 1;
55 }
56 loop {
58 match code.get(read_next) {
59 Some(&c) if digit_lookup[c] => {
60 value = on_digit(value, c);
62 read_next += 1;
63 digits += 1;
64 }
65 _ => break,
66 };
67 }
68 if let Some(b';') = code.get(read_next) {
70 read_next += 1;
71 };
72 let char = Some(value)
74 .filter(|_| digits <= max_digits)
75 .and_then(from_u32)
76 .unwrap_or('\u{FFFD}');
77 Parsed::Decoded {
78 read_len: read_next - read_start,
79 write_len: char.encode_utf8(&mut code[write_pos..]).len(),
80 }
81}
82
83fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize, in_attr_val: bool) -> Parsed {
86 match ENTITY.longest_matching_prefix(&code[read_pos..]) {
87 TrieNodeMatch::Found {
88 len: match_len,
89 value,
90 } => match value {
91 EntityType::Dec => parse_numeric_entity(
92 code,
93 read_pos,
94 2,
96 write_pos,
97 DIGIT,
98 |value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32),
99 7,
100 ),
101 EntityType::Hex => parse_numeric_entity(
102 code,
103 read_pos,
104 3,
106 write_pos,
107 HEX_DIGIT,
108 |value, c| {
109 value.wrapping_mul(16).wrapping_add(match c {
110 c if DIGIT[c] => (c - b'0') as u32,
111 c if LOWER_HEX_ALPHA[c] => 10 + (c - b'a') as u32,
112 c if UPPER_HEX_ALPHA[c] => 10 + (c - b'A') as u32,
113 _ => unreachable!(),
114 })
115 },
116 6,
117 ),
118 EntityType::Named(decoded) => {
119 if decoded.len() > match_len
121 || in_attr_val
122 && *code.get(read_pos + match_len - 1).unwrap() != b';'
123 && code
124 .get(read_pos + match_len)
125 .filter(|c| ALPHANUMERIC_OR_EQUALS[**c])
126 .is_some()
127 {
128 Parsed::LeftEncoded
129 } else {
130 code[write_pos..write_pos + decoded.len()].copy_from_slice(decoded);
131 Parsed::Decoded {
132 read_len: match_len,
133 write_len: decoded.len(),
134 }
135 }
136 }
137 },
138 TrieNodeMatch::NotFound { reached } => Parsed::Invalid { len: reached },
140 }
141}
142
143pub fn maybe_normalise_entity(proc: &mut Processor, in_attr_val: bool) -> bool {
146 if proc.peek(0).filter(|c| *c == b'&').is_none() {
147 return false;
148 };
149
150 let start = proc.read_next;
151
152 let mut read_next = start;
156 let mut write_next = start;
157 let mut node = ENTITY;
158 while node.value.is_none() {
159 match proc.code.get(read_next) {
160 None => break,
161 Some(b'&') => {
162 let (read_len, write_len) =
164 match parse_entity(proc.code, read_next, write_next, in_attr_val) {
165 Parsed::LeftEncoded => {
166 break;
168 }
169 Parsed::Decoded {
170 read_len,
171 write_len,
172 } => {
173 debug_assert!(read_len > 0);
174 debug_assert!(write_len > 0);
175 (read_len, write_len)
176 }
177 Parsed::Invalid { len } => {
178 debug_assert!(len > 0);
179 if read_next != start {
188 break;
189 };
190 proc
191 .code
192 .copy_within(read_next..read_next + len, write_next);
193 (len, len)
194 }
195 };
196 debug_assert!(read_len > 0);
197
198 let (new_node, match_len) =
199 node.shortest_matching_prefix(&proc.code[write_next..write_next + write_len], 0);
200 node = new_node;
201 read_next += read_len;
202 write_next += write_len;
203 if match_len < write_len {
204 break;
207 };
208 }
209 Some(_) => {
210 let (new_node, new_read_next) = node.shortest_matching_prefix(proc.code, read_next);
211 let len = new_read_next - read_next;
212 if len == 0 {
213 break;
214 };
215 proc.code.copy_within(read_next..new_read_next, write_next);
216 read_next += len;
217 write_next += len;
218 node = new_node;
219 }
220 };
221 }
222 let undecodable = node.value.is_some();
224 let mut shifted_start = read_next - (write_next - start - undecodable as usize);
226 proc
227 .code
228 .copy_within(start + undecodable as usize..write_next, shifted_start);
229 if undecodable {
230 debug_assert_eq!(proc.code.get(start), Some(&b'&'));
231 proc.code[shifted_start - 4..shifted_start].copy_from_slice(b"&");
232 shifted_start -= 4;
233 };
234
235 proc.read_next = shifted_start;
236 true
237}