minify_html/entity/
decode.rs

1// Based on the data sourced from https://html.spec.whatwg.org/entities.json:
2// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
3// - Some character entity references do not end with a semicolon.
4//   - All of these entities also have a corresponding entity with semicolon.
5// - The longest name is "CounterClockwiseContourIntegral", with length 31 (excluding leading ampersand and trailing
6//   semicolon).
7// - All entity names are at least 2 characters long.
8// - Some named entities are actually shorter than their decoded characters as UTF-8.
9
10// Browser implementation behaviour to consider:
11// - Browsers match longest sequence of characters that would form a valid entity.
12// - Names must match case sensitively.
13// - For a numeric entity, browsers actually consume an unlimited amount of digits, but decode to 0xFFFD if not a valid
14//   Unicode Scalar Value.
15
16use memchr::memchr;
17use minify_html_common::gen::codepoints::Lookup;
18use minify_html_common::gen::codepoints::ALPHANUMERIC_OR_EQUALS;
19use minify_html_common::gen::codepoints::DIGIT;
20use minify_html_common::gen::codepoints::HEX_DIGIT;
21use minify_html_common::gen::codepoints::LOWER_HEX_ALPHA;
22use minify_html_common::gen::codepoints::UPPER_HEX_ALPHA;
23use minify_html_common::gen::entities::EntityType;
24use minify_html_common::gen::entities::ENTITY;
25use minify_html_common::pattern::TrieNodeMatch;
26use std::char::from_u32;
27
/// Outcome of attempting to decode a single character reference.
enum Decoded {
  /// A named reference was recognised; the payload is the bytes to emit in its place.
  /// NOTE: the payload may itself still be in encoded form when that is shorter than the decoded character.
  Named(&'static [u8]),
  /// A numeric reference (`&#…` / `&#x…`) resolved to this character.
  Numeric(char),
  /// Nothing should be decoded; the bytes that were read are to be emitted verbatim.
  Ignored,
}
33
34struct ParsedEntity {
35  decoded: Decoded,
36  read_len: usize,
37}
38
39fn parse_numeric_entity(
40  code: &[u8],
41  // read_start should be separate (and not simply `&code[read_start..]`) so that read_len result is correct.
42  read_start: usize,
43  digit_lookup: &'static Lookup,
44  on_digit: fn(u32, u8) -> u32,
45  max_digits: usize,
46) -> ParsedEntity {
47  let mut value = 0u32;
48  let mut digits = 0;
49  let mut read_next = read_start;
50  // Skip initial zeros.
51  while code.get(read_next).filter(|c| **c == b'0').is_some() {
52    read_next += 1;
53  }
54  // Browser will still continue to consume digits past max_digits.
55  loop {
56    match code.get(read_next) {
57      Some(&c) if digit_lookup[c] => {
58        // We don't care about overflow, as it will be considered malformed past max_digits anyway.
59        value = on_digit(value, c);
60        read_next += 1;
61        digits += 1;
62      }
63      _ => break,
64    };
65  }
66  // Semicolon is required by spec but seems to be optional in actual browser behaviour.
67  if let Some(b';') = code.get(read_next) {
68    read_next += 1;
69  };
70  // Browsers decode to a replacement character (U+FFFD) if malformed.
71  let char = Some(value)
72    .filter(|_| digits <= max_digits)
73    .and_then(from_u32)
74    .unwrap_or('\u{FFFD}');
75  ParsedEntity {
76    read_len: read_next,
77    decoded: Decoded::Numeric(char),
78  }
79}
80
81fn parse_entity(code: &[u8], in_attr_val: bool) -> ParsedEntity {
82  match ENTITY.longest_matching_prefix(code) {
83    // The entity is malformed.
84    TrieNodeMatch::NotFound { reached } => ParsedEntity {
85      read_len: reached,
86      decoded: Decoded::Ignored,
87    },
88    TrieNodeMatch::Found {
89      len: match_len,
90      value,
91    } => match value {
92      EntityType::Dec => parse_numeric_entity(
93        code,
94        // Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'.
95        2,
96        DIGIT,
97        |value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32),
98        7,
99      ),
100      EntityType::Hex => parse_numeric_entity(
101        code,
102        // Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'.
103        3,
104        HEX_DIGIT,
105        |value, c| {
106          value.wrapping_mul(16).wrapping_add(match c {
107            c if DIGIT[c] => (c - b'0') as u32,
108            c if LOWER_HEX_ALPHA[c] => 10 + (c - b'a') as u32,
109            c if UPPER_HEX_ALPHA[c] => 10 + (c - b'A') as u32,
110            _ => unreachable!(),
111          })
112        },
113        6,
114      ),
115      EntityType::Named(decoded) => {
116        if in_attr_val
117          && code[match_len - 1] != b';'
118          && code
119            .get(match_len)
120            .filter(|&&c| ALPHANUMERIC_OR_EQUALS[c])
121            .is_some()
122        {
123          // Don't decode if named entity is inside an attribute value and doesn't end with semicolon but is followed by an alphanumeric or `=` character.
124          // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
125          ParsedEntity {
126            read_len: match_len,
127            decoded: Decoded::Ignored,
128          }
129        } else {
130          // NOTE: `decoded` might be in encoded form if encoded form is shorter than decoded.
131          ParsedEntity {
132            read_len: match_len,
133            decoded: Decoded::Named(decoded),
134          }
135        }
136      }
137    },
138  }
139}
140
141pub fn decode_entities(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
142  let mut res = Vec::<u8>::new();
143  while !code.is_empty() {
144    let (before, matched) = match memchr(b'&', code) {
145      None => (code.len(), false),
146      Some(n) => (n, true),
147    };
148    res.extend_from_slice(&code[..before]);
149    code = &code[before..];
150    if matched {
151      let ParsedEntity { decoded, read_len } = parse_entity(code, in_attr_val);
152      match decoded {
153        Decoded::Numeric(c) => {
154          let mut buf = [0u8; 4];
155          let encoded = c.encode_utf8(&mut buf);
156          res.extend_from_slice(encoded.as_bytes());
157        }
158        Decoded::Ignored => res.extend_from_slice(&code[..read_len]),
159        Decoded::Named(s) => res.extend_from_slice(s),
160      };
161      code = &code[read_len..];
162    };
163  }
164  res
165}