minify_html_onepass/proc/
entity.rs

1// Based on the data sourced from https://html.spec.whatwg.org/entities.json:
2// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
3// - Some character entity references do not end with a semicolon.
4//   - All of these entities also have a corresponding entity with semicolon.
5// - The longest name is "CounterClockwiseContourIntegral", with length 31 (excluding leading ampersand and trailing
6//   semicolon).
7// - All entity names are at least 2 characters long.
8// - Some named entities are actually shorter than their decoded characters as UTF-8.
9
10// Browser implementation behaviour to consider:
11// - Browsers match longest sequence of characters that would form a valid entity.
12// - Names must match case sensitively.
13// - For a numeric entity, browsers actually consume an unlimited amount of digits, but decode to 0xFFFD if not a valid
14//   Unicode Scalar Value.
15
16use crate::proc::Processor;
17use minify_html_common::gen::codepoints::Lookup;
18use minify_html_common::gen::codepoints::ALPHANUMERIC_OR_EQUALS;
19use minify_html_common::gen::codepoints::DIGIT;
20use minify_html_common::gen::codepoints::HEX_DIGIT;
21use minify_html_common::gen::codepoints::LOWER_HEX_ALPHA;
22use minify_html_common::gen::codepoints::UPPER_HEX_ALPHA;
23use minify_html_common::gen::entities::EntityType;
24use minify_html_common::gen::entities::ENTITY;
25use minify_html_common::pattern::TrieNodeMatch;
26use std::char::from_u32;
27
// Outcome of trying to parse one entity-like sequence starting at an ampersand.
enum Parsed {
  // The entity was decoded and its replacement written out. `read_len` is how many source bytes
  // the entity occupied; `write_len` is how many bytes the decoded value occupies.
  // Invalid numeric entities land here too: they decode to U+FFFD.
  Decoded {
    read_len: usize,
    write_len: usize,
  },
  // The entity is valid but deliberately kept as-is. This happens when:
  // - the decoded UTF-8 sequence would be longer than the entity itself; or
  // - it is a named entity inside an attribute value that lacks the trailing ';' and is followed
  //   by an alphanumeric or `=` character, which the spec says must not be decoded.
  LeftEncoded,
  // The sequence looked like an entity but did not match anything in the `ENTITY` trie.
  // `len` is how far the match got before failing.
  Invalid {
    len: usize,
  },
}
38
39#[inline(always)]
40fn parse_numeric_entity(
41  code: &mut [u8],
42  read_start: usize,
43  prefix_len: usize,
44  write_pos: usize,
45  digit_lookup: &'static Lookup,
46  on_digit: fn(u32, u8) -> u32,
47  max_digits: usize,
48) -> Parsed {
49  let mut value = 0u32;
50  let mut digits = 0;
51  let mut read_next = read_start + prefix_len;
52  // Skip initial zeros.
53  while code.get(read_next).filter(|c| **c == b'0').is_some() {
54    read_next += 1;
55  }
56  // Browser will still continue to consume digits past max_digits.
57  loop {
58    match code.get(read_next) {
59      Some(&c) if digit_lookup[c] => {
60        // We don't care about overflow, as it will be considered malformed past max_digits anyway.
61        value = on_digit(value, c);
62        read_next += 1;
63        digits += 1;
64      }
65      _ => break,
66    };
67  }
68  // Semicolon is required by spec but seems to be optional in actual browser behaviour.
69  if let Some(b';') = code.get(read_next) {
70    read_next += 1;
71  };
72  // Browsers decode to a replacement character (U+FFFD) if malformed.
73  let char = Some(value)
74    .filter(|_| digits <= max_digits)
75    .and_then(from_u32)
76    .unwrap_or('\u{FFFD}');
77  Parsed::Decoded {
78    read_len: read_next - read_start,
79    write_len: char.encode_utf8(&mut code[write_pos..]).len(),
80  }
81}
82
83// Parse the entity and write its decoded value at {@param write_pos}.
84// If malformed, returns the longest matching entity prefix length, and does not write/decode anything.
85fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize, in_attr_val: bool) -> Parsed {
86  match ENTITY.longest_matching_prefix(&code[read_pos..]) {
87    TrieNodeMatch::Found {
88      len: match_len,
89      value,
90    } => match value {
91      EntityType::Dec => parse_numeric_entity(
92        code,
93        read_pos,
94        // Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'.
95        2,
96        write_pos,
97        DIGIT,
98        |value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32),
99        7,
100      ),
101      EntityType::Hex => parse_numeric_entity(
102        code,
103        read_pos,
104        // Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'.
105        3,
106        write_pos,
107        HEX_DIGIT,
108        |value, c| {
109          value.wrapping_mul(16).wrapping_add(match c {
110            c if DIGIT[c] => (c - b'0') as u32,
111            c if LOWER_HEX_ALPHA[c] => 10 + (c - b'a') as u32,
112            c if UPPER_HEX_ALPHA[c] => 10 + (c - b'A') as u32,
113            _ => unreachable!(),
114          })
115        },
116        6,
117      ),
118      EntityType::Named(decoded) => {
119        // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
120        if decoded.len() > match_len
121          || in_attr_val
122            && *code.get(read_pos + match_len - 1).unwrap() != b';'
123            && code
124              .get(read_pos + match_len)
125              .filter(|c| ALPHANUMERIC_OR_EQUALS[**c])
126              .is_some()
127        {
128          Parsed::LeftEncoded
129        } else {
130          code[write_pos..write_pos + decoded.len()].copy_from_slice(decoded);
131          Parsed::Decoded {
132            read_len: match_len,
133            write_len: decoded.len(),
134          }
135        }
136      }
137    },
138    // The entity is malformed.
139    TrieNodeMatch::NotFound { reached } => Parsed::Invalid { len: reached },
140  }
141}
142
143// Normalise entity such that "&lt; hello" becomes "___< hello".
144// For something like "&a&#109;&#112; hello", it becomes "_______&ampamp hello".
145pub fn maybe_normalise_entity(proc: &mut Processor, in_attr_val: bool) -> bool {
146  if proc.peek(0).filter(|c| *c == b'&').is_none() {
147    return false;
148  };
149
150  let start = proc.read_next;
151
152  // We want to look ahead in case this entity decodes to something beginning with '&' and the following code (after
153  // any decoding) would form an unintentional entity.
154  // For example, `&a&#109p;` would output as `&amp`, which is an unintentional entity.
155  let mut read_next = start;
156  let mut write_next = start;
157  let mut node = ENTITY;
158  while node.value.is_none() {
159    match proc.code.get(read_next) {
160      None => break,
161      Some(b'&') => {
162        // Decode before checking to see if it continues current entity.
163        let (read_len, write_len) =
164          match parse_entity(proc.code, read_next, write_next, in_attr_val) {
165            Parsed::LeftEncoded => {
166              // Don't mistake an intentionally undecoded entity for an unintentional entity.
167              break;
168            }
169            Parsed::Decoded {
170              read_len,
171              write_len,
172            } => {
173              debug_assert!(read_len > 0);
174              debug_assert!(write_len > 0);
175              (read_len, write_len)
176            }
177            Parsed::Invalid { len } => {
178              debug_assert!(len > 0);
179              // We only want to keep reading entities that will decode. No entity has an ampersand after the
180              // first character, so we don't need to keep checking if we see one; however, malformed entities
181              // could be part of their own unintentional entity, so don't consume them.
182              //
183              // For example:
184              // &am&am&#112;
185              // When parsing from the first `&`, stop before the second `&`, as otherwise the second `&am`
186              // won't be normalised to `&ampamp;`.
187              if read_next != start {
188                break;
189              };
190              proc
191                .code
192                .copy_within(read_next..read_next + len, write_next);
193              (len, len)
194            }
195          };
196        debug_assert!(read_len > 0);
197
198        let (new_node, match_len) =
199          node.shortest_matching_prefix(&proc.code[write_next..write_next + write_len], 0);
200        node = new_node;
201        read_next += read_len;
202        write_next += write_len;
203        if match_len < write_len {
204          // Either new_node has a value, or we can't match anymore and so there will definitely be no
205          // unintentional entity.
206          break;
207        };
208      }
209      Some(_) => {
210        let (new_node, new_read_next) = node.shortest_matching_prefix(proc.code, read_next);
211        let len = new_read_next - read_next;
212        if len == 0 {
213          break;
214        };
215        proc.code.copy_within(read_next..new_read_next, write_next);
216        read_next += len;
217        write_next += len;
218        node = new_node;
219      }
220    };
221  }
222  // Check if we need to encode initial '&' and add 'amp'.
223  let undecodable = node.value.is_some();
224  // Shift decoded value down so that it ends at read_next (exclusive).
225  let mut shifted_start = read_next - (write_next - start - undecodable as usize);
226  proc
227    .code
228    .copy_within(start + undecodable as usize..write_next, shifted_start);
229  if undecodable {
230    debug_assert_eq!(proc.code.get(start), Some(&b'&'));
231    proc.code[shifted_start - 4..shifted_start].copy_from_slice(b"&amp");
232    shifted_start -= 4;
233  };
234
235  proc.read_next = shifted_start;
236  true
237}