html_escape/decode/html_entity/
mod.rs

1mod tables;
2
3pub use tables::*;
4
5use core::convert::TryFrom;
6use core::str::from_utf8_unchecked;
7
8use alloc::borrow::Cow;
9use alloc::string::String;
10use alloc::vec::Vec;
11
12#[cfg(feature = "std")]
13use std::io::{self, Write};
14
15use crate::functions::*;
16
17/// Decode html entities in a given string.
18pub fn decode_html_entities<S: ?Sized + AsRef<str>>(text: &S) -> Cow<str> {
19    let text = text.as_ref();
20    let text_bytes = text.as_bytes();
21    let text_length = text_bytes.len();
22
23    let mut p = 0;
24    let mut ep = 0;
25    let mut e;
26
27    let mut step = 0;
28
29    let (mut v, mut start) = loop {
30        if p == text_length {
31            return Cow::from(text);
32        }
33
34        e = text_bytes[p];
35
36        match step {
37            0 => {
38                if e == b'&' {
39                    step = 1;
40                    ep = p;
41                }
42            }
43            1 => {
44                match e {
45                    b'#' => {
46                        step = 3;
47                    }
48                    b';' => {
49                        // incorrect
50                        step = 0;
51                    }
52                    _ => {
53                        step = 2;
54                    }
55                }
56            }
57            2 => {
58                if e == b';' {
59                    // named
60                    let mut v = Vec::with_capacity(text_length);
61
62                    v.extend_from_slice(&text_bytes[..ep]);
63
64                    let name = &text_bytes[(ep + 1)..p];
65
66                    match NAMED_ENTITIES.binary_search_by(|(t_name, _)| t_name.cmp(&name)) {
67                        Ok(index) => {
68                            v.extend_from_slice(NAMED_ENTITIES[index].1.as_bytes());
69                            break (v, p + 1);
70                        }
71                        Err(_) => break (v, ep),
72                    }
73                }
74            }
75            3 => {
76                match e {
77                    b'x' | b'X' => {
78                        step = 5;
79                    }
80                    b';' => {
81                        // incorrect
82                        step = 0;
83                    }
84                    _ => step = 4,
85                }
86            }
87            4 => {
88                if e == b';' {
89                    // numeric
90                    let mut v = Vec::with_capacity(text_length);
91
92                    v.extend_from_slice(&text_bytes[..ep]);
93
94                    let number = unsafe { text.get_unchecked((ep + 2)..p) };
95
96                    match number.parse::<u32>() {
97                        Ok(number) => {
98                            match char::try_from(number) {
99                                Ok(c) => {
100                                    write_char_to_vec(c, &mut v);
101                                    break (v, p + 1);
102                                }
103                                Err(_) => break (v, ep),
104                            }
105                        }
106                        Err(_) => break (v, ep),
107                    }
108                }
109            }
110            5 => {
111                match e {
112                    b';' => {
113                        // incorrect
114                        step = 0;
115                    }
116                    _ => step = 6,
117                }
118            }
119            6 => {
120                if e == b';' {
121                    // hex
122                    let mut v = Vec::with_capacity(text_length);
123
124                    v.extend_from_slice(&text_bytes[..ep]);
125
126                    let hex = unsafe { text.get_unchecked((ep + 3)..p) };
127
128                    match u32::from_str_radix(hex, 16) {
129                        Ok(number) => {
130                            match char::try_from(number) {
131                                Ok(c) => {
132                                    write_char_to_vec(c, &mut v);
133                                    break (v, p + 1);
134                                }
135                                Err(_) => break (v, ep),
136                            }
137                        }
138                        Err(_) => break (v, ep),
139                    }
140                }
141            }
142            _ => unreachable!(),
143        }
144
145        p += 1;
146    };
147
148    p += 1;
149
150    step = 0;
151
152    for e in text_bytes[p..].iter().copied() {
153        match step {
154            0 => {
155                if e == b'&' {
156                    step = 1;
157                    ep = p;
158                }
159            }
160            1 => {
161                match e {
162                    b'#' => {
163                        step = 3;
164                    }
165                    b';' => {
166                        // incorrect
167                        step = 0;
168                    }
169                    _ => {
170                        step = 2;
171                    }
172                }
173            }
174            2 => {
175                if e == b';' {
176                    // named
177                    step = 0;
178
179                    let name = &text_bytes[(ep + 1)..p];
180
181                    if let Ok(index) =
182                        NAMED_ENTITIES.binary_search_by(|(t_name, _)| t_name.cmp(&name))
183                    {
184                        v.extend_from_slice(&text_bytes[start..ep]);
185                        start = p + 1;
186                        v.extend_from_slice(NAMED_ENTITIES[index].1.as_bytes());
187                    }
188                }
189            }
190            3 => {
191                match e {
192                    b'x' | b'X' => {
193                        step = 5;
194                    }
195                    b';' => {
196                        // incorrect
197                        step = 0;
198                    }
199                    _ => step = 4,
200                }
201            }
202            4 => {
203                if e == b';' {
204                    // numeric
205                    step = 0;
206
207                    let number = unsafe { text.get_unchecked((ep + 2)..p) };
208
209                    if let Ok(number) = number.parse::<u32>() {
210                        if let Ok(c) = char::try_from(number) {
211                            v.extend_from_slice(&text_bytes[start..ep]);
212                            start = p + 1;
213                            write_char_to_vec(c, &mut v);
214                        }
215                    }
216                }
217            }
218            5 => {
219                match e {
220                    b';' => {
221                        // incorrect
222                        step = 0;
223                    }
224                    _ => step = 6,
225                }
226            }
227            6 => {
228                if e == b';' {
229                    // hex
230                    step = 0;
231
232                    let hex = unsafe { text.get_unchecked((ep + 3)..p) };
233
234                    if let Ok(number) = u32::from_str_radix(hex, 16) {
235                        if let Ok(c) = char::try_from(number) {
236                            v.extend_from_slice(&text_bytes[start..ep]);
237                            start = p + 1;
238                            write_char_to_vec(c, &mut v);
239                        }
240                    }
241                }
242            }
243            _ => unreachable!(),
244        }
245
246        p += 1;
247    }
248
249    v.extend_from_slice(&text_bytes[start..p]);
250
251    Cow::from(unsafe { String::from_utf8_unchecked(v) })
252}
253
254/// Decode html entities in a given string to a mutable `String` reference and return the decoded string slice.
255pub fn decode_html_entities_to_string<S: AsRef<str>>(text: S, output: &mut String) -> &str {
256    unsafe { from_utf8_unchecked(decode_html_entities_to_vec(text, output.as_mut_vec())) }
257}
258
259/// Decode html entities in a given string to a mutable `Vec<u8>` reference and return the decoded data slice.
260pub fn decode_html_entities_to_vec<S: AsRef<str>>(text: S, output: &mut Vec<u8>) -> &[u8] {
261    let text = text.as_ref();
262    let text_bytes = text.as_bytes();
263    let text_length = text_bytes.len();
264
265    output.reserve(text_length);
266
267    let current_length = output.len();
268
269    let mut start = 0;
270    let mut end = 0;
271    let mut ep = 0;
272
273    let mut step = 0;
274
275    for e in text_bytes.iter().copied() {
276        match step {
277            0 => {
278                if e == b'&' {
279                    step = 1;
280                    ep = end;
281                }
282            }
283            1 => {
284                match e {
285                    b'#' => {
286                        step = 3;
287                    }
288                    b';' => {
289                        // incorrect
290                        step = 0;
291                    }
292                    _ => {
293                        step = 2;
294                    }
295                }
296            }
297            2 => {
298                if e == b';' {
299                    // named
300                    step = 0;
301
302                    let name = &text_bytes[(ep + 1)..end];
303
304                    if let Ok(index) =
305                        NAMED_ENTITIES.binary_search_by(|(t_name, _)| t_name.cmp(&name))
306                    {
307                        output.extend_from_slice(&text_bytes[start..ep]);
308                        start = end + 1;
309                        output.extend_from_slice(NAMED_ENTITIES[index].1.as_bytes());
310                    }
311                }
312            }
313            3 => {
314                match e {
315                    b'x' | b'X' => {
316                        step = 5;
317                    }
318                    b';' => {
319                        // incorrect
320                        step = 0;
321                    }
322                    _ => step = 4,
323                }
324            }
325            4 => {
326                if e == b';' {
327                    // numeric
328                    step = 0;
329
330                    let number = unsafe { text.get_unchecked((ep + 2)..end) };
331
332                    if let Ok(number) = number.parse::<u32>() {
333                        if let Ok(c) = char::try_from(number) {
334                            output.extend_from_slice(&text_bytes[start..ep]);
335                            start = end + 1;
336                            write_char_to_vec(c, output);
337                        }
338                    }
339                }
340            }
341            5 => {
342                match e {
343                    b';' => {
344                        // incorrect
345                        step = 0;
346                    }
347                    _ => step = 6,
348                }
349            }
350            6 => {
351                if e == b';' {
352                    // hex
353                    step = 0;
354
355                    let hex = unsafe { text.get_unchecked((ep + 3)..end) };
356
357                    if let Ok(number) = u32::from_str_radix(hex, 16) {
358                        if let Ok(c) = char::try_from(number) {
359                            output.extend_from_slice(&text_bytes[start..ep]);
360                            start = end + 1;
361                            write_char_to_vec(c, output);
362                        }
363                    }
364                }
365            }
366            _ => unreachable!(),
367        }
368
369        end += 1;
370    }
371
372    output.extend_from_slice(&text_bytes[start..end]);
373
374    &output[current_length..]
375}
376
#[cfg(feature = "std")]
/// Decode html entities in a given string to a writer.
///
/// Named entities (`&name;`, looked up in `NAMED_ENTITIES`), decimal references (`&#38;`)
/// and hexadecimal references (`&#x26;` or `&#X26;`) are replaced. A candidate that is not
/// a known name, or whose number is not a valid `char`, is written through unchanged. A `&`
/// always starts a new candidate, so a stray ampersand cannot hide an entity that follows it.
///
/// # Errors
///
/// Returns the first error reported by `output`. Data written before the failure is not
/// rolled back.
pub fn decode_html_entities_to_writer<S: AsRef<str>, W: Write>(
    text: S,
    output: &mut W,
) -> Result<(), io::Error> {
    // Parses the part of a numeric character reference that sits between `&#` and `;`.
    // A leading `x`/`X` selects hexadecimal, anything else is read as decimal.
    fn parse_numeric_reference(reference: &str) -> Option<char> {
        let (digits, radix) = match reference.as_bytes().first() {
            Some(&b'x') | Some(&b'X') => (&reference[1..], 16),
            _ => (reference, 10),
        };

        // `from_str_radix` tolerates a leading `+`, but `&#+65;` is not a character reference.
        if digits.starts_with('+') {
            return None;
        }

        // Empty or malformed digits and overflowing values are rejected here.
        let number = u32::from_str_radix(digits, radix).ok()?;

        // Surrogates and values above U+10FFFF are not `char`s.
        char::try_from(number).ok()
    }

    let text = text.as_ref();
    let text_bytes = text.as_bytes();

    // Start of the input that has not been written to `output` yet.
    let mut start = 0;
    // Position of the most recent `&` that has not met its `;` yet.
    let mut pending_amp: Option<usize> = None;

    for (end, e) in text_bytes.iter().copied().enumerate() {
        match e {
            // No entity body contains `&`, so a new one always restarts the candidate.
            b'&' => pending_amp = Some(end),
            b';' => {
                if let Some(ep) = pending_amp.take() {
                    let name = &text_bytes[(ep + 1)..end];

                    match name.first() {
                        // `&;` has no body at all.
                        None => (),
                        Some(&b'#') => {
                            // `ep + 2` follows the ASCII `#` and `end` is the ASCII `;`, so
                            // this slice starts and ends on a char boundary.
                            if let Some(c) = parse_numeric_reference(&text[(ep + 2)..end]) {
                                output.write_all(&text_bytes[start..ep])?;
                                start = end + 1;
                                write_char_to_writer(c, output)?;
                            }
                        }
                        Some(_) => {
                            if let Ok(index) =
                                NAMED_ENTITIES.binary_search_by(|(t_name, _)| t_name.cmp(&name))
                            {
                                output.write_all(&text_bytes[start..ep])?;
                                start = end + 1;
                                output.write_all(NAMED_ENTITIES[index].1.as_bytes())?;
                            }
                        }
                    }
                }
            }
            _ => (),
        }
    }

    // Write whatever follows the last replaced entity.
    output.write_all(&text_bytes[start..])
}