minify_html/entity/
decode.rs1use memchr::memchr;
17use minify_html_common::gen::codepoints::Lookup;
18use minify_html_common::gen::codepoints::ALPHANUMERIC_OR_EQUALS;
19use minify_html_common::gen::codepoints::DIGIT;
20use minify_html_common::gen::codepoints::HEX_DIGIT;
21use minify_html_common::gen::codepoints::LOWER_HEX_ALPHA;
22use minify_html_common::gen::codepoints::UPPER_HEX_ALPHA;
23use minify_html_common::gen::entities::EntityType;
24use minify_html_common::gen::entities::ENTITY;
25use minify_html_common::pattern::TrieNodeMatch;
26use std::char::from_u32;
27
/// Outcome of attempting to decode one character reference.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Decoded {
    /// Nothing was decoded; the consumed source bytes are to be kept verbatim.
    Ignored,
    /// A named reference; the payload is the replacement byte sequence.
    Named(&'static [u8]),
    /// A numeric reference; the payload is the character it denotes.
    Numeric(char),
}

/// A decode outcome together with the amount of input it accounts for.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ParsedEntity {
    /// What the consumed input decodes to.
    decoded: Decoded,
    /// Number of bytes, counted from the start of the parsed slice, that were consumed.
    read_len: usize,
}
38
39fn parse_numeric_entity(
40 code: &[u8],
41 read_start: usize,
43 digit_lookup: &'static Lookup,
44 on_digit: fn(u32, u8) -> u32,
45 max_digits: usize,
46) -> ParsedEntity {
47 let mut value = 0u32;
48 let mut digits = 0;
49 let mut read_next = read_start;
50 while code.get(read_next).filter(|c| **c == b'0').is_some() {
52 read_next += 1;
53 }
54 loop {
56 match code.get(read_next) {
57 Some(&c) if digit_lookup[c] => {
58 value = on_digit(value, c);
60 read_next += 1;
61 digits += 1;
62 }
63 _ => break,
64 };
65 }
66 if let Some(b';') = code.get(read_next) {
68 read_next += 1;
69 };
70 let char = Some(value)
72 .filter(|_| digits <= max_digits)
73 .and_then(from_u32)
74 .unwrap_or('\u{FFFD}');
75 ParsedEntity {
76 read_len: read_next,
77 decoded: Decoded::Numeric(char),
78 }
79}
80
81fn parse_entity(code: &[u8], in_attr_val: bool) -> ParsedEntity {
82 match ENTITY.longest_matching_prefix(code) {
83 TrieNodeMatch::NotFound { reached } => ParsedEntity {
85 read_len: reached,
86 decoded: Decoded::Ignored,
87 },
88 TrieNodeMatch::Found {
89 len: match_len,
90 value,
91 } => match value {
92 EntityType::Dec => parse_numeric_entity(
93 code,
94 2,
96 DIGIT,
97 |value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32),
98 7,
99 ),
100 EntityType::Hex => parse_numeric_entity(
101 code,
102 3,
104 HEX_DIGIT,
105 |value, c| {
106 value.wrapping_mul(16).wrapping_add(match c {
107 c if DIGIT[c] => (c - b'0') as u32,
108 c if LOWER_HEX_ALPHA[c] => 10 + (c - b'a') as u32,
109 c if UPPER_HEX_ALPHA[c] => 10 + (c - b'A') as u32,
110 _ => unreachable!(),
111 })
112 },
113 6,
114 ),
115 EntityType::Named(decoded) => {
116 if in_attr_val
117 && code[match_len - 1] != b';'
118 && code
119 .get(match_len)
120 .filter(|&&c| ALPHANUMERIC_OR_EQUALS[c])
121 .is_some()
122 {
123 ParsedEntity {
126 read_len: match_len,
127 decoded: Decoded::Ignored,
128 }
129 } else {
130 ParsedEntity {
132 read_len: match_len,
133 decoded: Decoded::Named(decoded),
134 }
135 }
136 }
137 },
138 }
139}
140
141pub fn decode_entities(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
142 let mut res = Vec::<u8>::new();
143 while !code.is_empty() {
144 let (before, matched) = match memchr(b'&', code) {
145 None => (code.len(), false),
146 Some(n) => (n, true),
147 };
148 res.extend_from_slice(&code[..before]);
149 code = &code[before..];
150 if matched {
151 let ParsedEntity { decoded, read_len } = parse_entity(code, in_attr_val);
152 match decoded {
153 Decoded::Numeric(c) => {
154 let mut buf = [0u8; 4];
155 let encoded = c.encode_utf8(&mut buf);
156 res.extend_from_slice(encoded.as_bytes());
157 }
158 Decoded::Ignored => res.extend_from_slice(&code[..read_len]),
159 Decoded::Named(s) => res.extend_from_slice(s),
160 };
161 code = &code[read_len..];
162 };
163 }
164 res
165}