html_escape/decode/html_entity/
mod.rs1mod tables;
2
3pub use tables::*;
4
5use core::convert::TryFrom;
6use core::str::from_utf8_unchecked;
7
8use alloc::borrow::Cow;
9use alloc::string::String;
10use alloc::vec::Vec;
11
12#[cfg(feature = "std")]
13use std::io::{self, Write};
14
15use crate::functions::*;
16
17pub fn decode_html_entities<S: ?Sized + AsRef<str>>(text: &S) -> Cow<str> {
19 let text = text.as_ref();
20 let text_bytes = text.as_bytes();
21 let text_length = text_bytes.len();
22
23 let mut p = 0;
24 let mut ep = 0;
25 let mut e;
26
27 let mut step = 0;
28
29 let (mut v, mut start) = loop {
30 if p == text_length {
31 return Cow::from(text);
32 }
33
34 e = text_bytes[p];
35
36 match step {
37 0 => {
38 if e == b'&' {
39 step = 1;
40 ep = p;
41 }
42 }
43 1 => {
44 match e {
45 b'#' => {
46 step = 3;
47 }
48 b';' => {
49 step = 0;
51 }
52 _ => {
53 step = 2;
54 }
55 }
56 }
57 2 => {
58 if e == b';' {
59 let mut v = Vec::with_capacity(text_length);
61
62 v.extend_from_slice(&text_bytes[..ep]);
63
64 let name = &text_bytes[(ep + 1)..p];
65
66 match NAMED_ENTITIES.binary_search_by(|(t_name, _)| t_name.cmp(&name)) {
67 Ok(index) => {
68 v.extend_from_slice(NAMED_ENTITIES[index].1.as_bytes());
69 break (v, p + 1);
70 }
71 Err(_) => break (v, ep),
72 }
73 }
74 }
75 3 => {
76 match e {
77 b'x' | b'X' => {
78 step = 5;
79 }
80 b';' => {
81 step = 0;
83 }
84 _ => step = 4,
85 }
86 }
87 4 => {
88 if e == b';' {
89 let mut v = Vec::with_capacity(text_length);
91
92 v.extend_from_slice(&text_bytes[..ep]);
93
94 let number = unsafe { text.get_unchecked((ep + 2)..p) };
95
96 match number.parse::<u32>() {
97 Ok(number) => {
98 match char::try_from(number) {
99 Ok(c) => {
100 write_char_to_vec(c, &mut v);
101 break (v, p + 1);
102 }
103 Err(_) => break (v, ep),
104 }
105 }
106 Err(_) => break (v, ep),
107 }
108 }
109 }
110 5 => {
111 match e {
112 b';' => {
113 step = 0;
115 }
116 _ => step = 6,
117 }
118 }
119 6 => {
120 if e == b';' {
121 let mut v = Vec::with_capacity(text_length);
123
124 v.extend_from_slice(&text_bytes[..ep]);
125
126 let hex = unsafe { text.get_unchecked((ep + 3)..p) };
127
128 match u32::from_str_radix(hex, 16) {
129 Ok(number) => {
130 match char::try_from(number) {
131 Ok(c) => {
132 write_char_to_vec(c, &mut v);
133 break (v, p + 1);
134 }
135 Err(_) => break (v, ep),
136 }
137 }
138 Err(_) => break (v, ep),
139 }
140 }
141 }
142 _ => unreachable!(),
143 }
144
145 p += 1;
146 };
147
148 p += 1;
149
150 step = 0;
151
152 for e in text_bytes[p..].iter().copied() {
153 match step {
154 0 => {
155 if e == b'&' {
156 step = 1;
157 ep = p;
158 }
159 }
160 1 => {
161 match e {
162 b'#' => {
163 step = 3;
164 }
165 b';' => {
166 step = 0;
168 }
169 _ => {
170 step = 2;
171 }
172 }
173 }
174 2 => {
175 if e == b';' {
176 step = 0;
178
179 let name = &text_bytes[(ep + 1)..p];
180
181 if let Ok(index) =
182 NAMED_ENTITIES.binary_search_by(|(t_name, _)| t_name.cmp(&name))
183 {
184 v.extend_from_slice(&text_bytes[start..ep]);
185 start = p + 1;
186 v.extend_from_slice(NAMED_ENTITIES[index].1.as_bytes());
187 }
188 }
189 }
190 3 => {
191 match e {
192 b'x' | b'X' => {
193 step = 5;
194 }
195 b';' => {
196 step = 0;
198 }
199 _ => step = 4,
200 }
201 }
202 4 => {
203 if e == b';' {
204 step = 0;
206
207 let number = unsafe { text.get_unchecked((ep + 2)..p) };
208
209 if let Ok(number) = number.parse::<u32>() {
210 if let Ok(c) = char::try_from(number) {
211 v.extend_from_slice(&text_bytes[start..ep]);
212 start = p + 1;
213 write_char_to_vec(c, &mut v);
214 }
215 }
216 }
217 }
218 5 => {
219 match e {
220 b';' => {
221 step = 0;
223 }
224 _ => step = 6,
225 }
226 }
227 6 => {
228 if e == b';' {
229 step = 0;
231
232 let hex = unsafe { text.get_unchecked((ep + 3)..p) };
233
234 if let Ok(number) = u32::from_str_radix(hex, 16) {
235 if let Ok(c) = char::try_from(number) {
236 v.extend_from_slice(&text_bytes[start..ep]);
237 start = p + 1;
238 write_char_to_vec(c, &mut v);
239 }
240 }
241 }
242 }
243 _ => unreachable!(),
244 }
245
246 p += 1;
247 }
248
249 v.extend_from_slice(&text_bytes[start..p]);
250
251 Cow::from(unsafe { String::from_utf8_unchecked(v) })
252}
253
254pub fn decode_html_entities_to_string<S: AsRef<str>>(text: S, output: &mut String) -> &str {
256 unsafe { from_utf8_unchecked(decode_html_entities_to_vec(text, output.as_mut_vec())) }
257}
258
259pub fn decode_html_entities_to_vec<S: AsRef<str>>(text: S, output: &mut Vec<u8>) -> &[u8] {
261 let text = text.as_ref();
262 let text_bytes = text.as_bytes();
263 let text_length = text_bytes.len();
264
265 output.reserve(text_length);
266
267 let current_length = output.len();
268
269 let mut start = 0;
270 let mut end = 0;
271 let mut ep = 0;
272
273 let mut step = 0;
274
275 for e in text_bytes.iter().copied() {
276 match step {
277 0 => {
278 if e == b'&' {
279 step = 1;
280 ep = end;
281 }
282 }
283 1 => {
284 match e {
285 b'#' => {
286 step = 3;
287 }
288 b';' => {
289 step = 0;
291 }
292 _ => {
293 step = 2;
294 }
295 }
296 }
297 2 => {
298 if e == b';' {
299 step = 0;
301
302 let name = &text_bytes[(ep + 1)..end];
303
304 if let Ok(index) =
305 NAMED_ENTITIES.binary_search_by(|(t_name, _)| t_name.cmp(&name))
306 {
307 output.extend_from_slice(&text_bytes[start..ep]);
308 start = end + 1;
309 output.extend_from_slice(NAMED_ENTITIES[index].1.as_bytes());
310 }
311 }
312 }
313 3 => {
314 match e {
315 b'x' | b'X' => {
316 step = 5;
317 }
318 b';' => {
319 step = 0;
321 }
322 _ => step = 4,
323 }
324 }
325 4 => {
326 if e == b';' {
327 step = 0;
329
330 let number = unsafe { text.get_unchecked((ep + 2)..end) };
331
332 if let Ok(number) = number.parse::<u32>() {
333 if let Ok(c) = char::try_from(number) {
334 output.extend_from_slice(&text_bytes[start..ep]);
335 start = end + 1;
336 write_char_to_vec(c, output);
337 }
338 }
339 }
340 }
341 5 => {
342 match e {
343 b';' => {
344 step = 0;
346 }
347 _ => step = 6,
348 }
349 }
350 6 => {
351 if e == b';' {
352 step = 0;
354
355 let hex = unsafe { text.get_unchecked((ep + 3)..end) };
356
357 if let Ok(number) = u32::from_str_radix(hex, 16) {
358 if let Ok(c) = char::try_from(number) {
359 output.extend_from_slice(&text_bytes[start..ep]);
360 start = end + 1;
361 write_char_to_vec(c, output);
362 }
363 }
364 }
365 }
366 _ => unreachable!(),
367 }
368
369 end += 1;
370 }
371
372 output.extend_from_slice(&text_bytes[start..end]);
373
374 &output[current_length..]
375}
376
/// Decodes the HTML character references in `text` and writes the result to
/// `output`.
///
/// Named references are looked up in `NAMED_ENTITIES`; decimal (`&#65;`) and
/// hexadecimal (`&#x41;` / `&#X41;`) references are converted to the
/// corresponding `char`. Anything that does not resolve is written unchanged.
///
/// A candidate reference always starts at the *most recent* `&`, so a bare
/// ampersand does not prevent a following reference from being decoded.
///
/// # Errors
///
/// Returns the first error reported by `output` (via `write_all` or
/// `write_char_to_writer`); data written before the failure is not rolled
/// back.
#[cfg(feature = "std")]
pub fn decode_html_entities_to_writer<S: AsRef<str>, W: Write>(
    text: S,
    output: &mut W,
) -> Result<(), io::Error> {
    // Parses the bytes between `&#` and `;` into a character.
    //
    // A leading `x`/`X` selects hexadecimal, otherwise the digits are decimal.
    // Only plain digits of the selected radix are accepted; in particular a
    // leading `+` (which `str::parse::<u32>` would tolerate) is rejected.
    fn decode_numeric_reference(digits: &[u8]) -> Option<char> {
        let (digits, radix) = match digits.first() {
            Some(&b'x') | Some(&b'X') => (&digits[1..], 16),
            _ => (digits, 10),
        };

        // `&#;` and `&#x;` carry no digits at all.
        if digits.is_empty() {
            return None;
        }

        let mut number: u32 = 0;

        for &digit in digits {
            let value = char::from(digit).to_digit(radix)?;
            // Overflowing `u32` means the reference cannot name a code point.
            number = number.checked_mul(radix)?.checked_add(value)?;
        }

        char::try_from(number).ok()
    }

    let text_bytes = text.as_ref().as_bytes();

    // Index of the first input byte that has not been written yet.
    let mut start = 0;
    // Position of the `&` that opens the current candidate reference.
    let mut ampersand: Option<usize> = None;

    for (end, e) in text_bytes.iter().copied().enumerate() {
        match e {
            // Every `&` (re)starts a candidate; an earlier unterminated one
            // is plain text.
            b'&' => ampersand = Some(end),
            b';' => {
                let ep = match ampersand.take() {
                    Some(ep) => ep,
                    // A `;` without a preceding `&` is plain text.
                    None => continue,
                };

                let reference = &text_bytes[(ep + 1)..end];

                match reference.first() {
                    // `&;`
                    None => {}
                    Some(&b'#') => {
                        if let Some(c) = decode_numeric_reference(&reference[1..]) {
                            output.write_all(&text_bytes[start..ep])?;
                            start = end + 1;
                            write_char_to_writer(c, output)?;
                        }
                    }
                    Some(_) => {
                        let name = reference;

                        if let Ok(index) =
                            NAMED_ENTITIES.binary_search_by(|(t_name, _)| t_name.cmp(&name))
                        {
                            output.write_all(&text_bytes[start..ep])?;
                            start = end + 1;
                            output.write_all(NAMED_ENTITIES[index].1.as_bytes())?;
                        }
                    }
                }
            }
            _ => {}
        }
    }

    // Write whatever follows the last decoded reference.
    output.write_all(&text_bytes[start..])
}