Skip to main content

fhp_tokenizer/
entity.rs

//! Entity decoding with SIMD fast-path.
//!
//! If the input contains no `&` characters, returns `Cow::Borrowed` (zero
//! allocation). Otherwise, decodes named (`&amp;`), decimal (`&#60;`), and
//! hex (`&#x3C;`) character references using `fhp_core::entity`.
6
7use std::borrow::Cow;
8
9/// Decode HTML entities in a string.
10///
11/// Fast path: if no `&` is present, returns `Cow::Borrowed` with zero
12/// allocation. Otherwise, decodes entities and returns `Cow::Owned`.
13///
14/// # Examples
15///
16/// ```
17/// use fhp_tokenizer::entity::decode_entities;
18///
19/// assert_eq!(decode_entities("hello"), "hello");
20/// assert_eq!(decode_entities("a & b"), "a & b");
21/// assert_eq!(decode_entities("&#60;div&#62;"), "<div>");
22/// assert_eq!(decode_entities("&#x3C;br&#x3E;"), "<br>");
23/// ```
24pub fn decode_entities<'a>(input: &'a str) -> Cow<'a, str> {
25    // Fast path: no ampersand → no entities to decode.
26    if !input.as_bytes().contains(&b'&') {
27        return Cow::Borrowed(input);
28    }
29
30    decode_entities_slow(input)
31}
32
33/// Slow path: actually decode entities.
34fn decode_entities_slow(input: &str) -> Cow<'_, str> {
35    let mut result = String::with_capacity(input.len());
36    let mut cursor = 0usize;
37
38    while let Some(rel_amp) = input[cursor..].find('&') {
39        let amp = cursor + rel_amp;
40        // Preserve all UTF-8 before '&' verbatim.
41        result.push_str(&input[cursor..amp]);
42
43        // Look for the closing ';' after '&'.
44        if let Some(rel_semi) = input[amp + 1..].find(';') {
45            let semi = amp + 1 + rel_semi;
46            let entity_body = &input[amp + 1..semi];
47            if try_decode_entity_into(entity_body, &mut result) {
48                cursor = semi + 1;
49                continue;
50            }
51        }
52
53        // Unrecognized entity — keep '&' and continue scanning.
54        result.push('&');
55        cursor = amp + 1;
56    }
57
58    // Append remaining tail.
59    if cursor < input.len() {
60        result.push_str(&input[cursor..]);
61    }
62
63    Cow::Owned(result)
64}
65
66/// Try to decode a single entity body directly into the result buffer.
67///
68/// Returns `true` if decoded successfully, writing directly to `result`
69/// without any intermediate `String` allocation.
70fn try_decode_entity_into(body: &str, result: &mut String) -> bool {
71    if body.is_empty() {
72        return false;
73    }
74
75    if let Some(digits) = body.strip_prefix('#') {
76        // Numeric entity.
77        if digits.starts_with('x') || digits.starts_with('X') {
78            // Hex: &#xHH;
79            if let Some(c) = fhp_core::entity::decode_numeric(&digits[1..], true) {
80                result.push(c);
81                return true;
82            }
83        } else {
84            // Decimal: &#DD;
85            if let Some(c) = fhp_core::entity::decode_numeric(digits, false) {
86                result.push(c);
87                return true;
88            }
89        }
90    } else {
91        // Named entity — &'static str, zero alloc.
92        if let Some(s) = fhp_core::entity::decode_named(body) {
93            result.push_str(s);
94            return true;
95        }
96    }
97
98    false
99}
100
#[cfg(test)]
mod tests {
    use super::*;

    /// Decode every `(input, expected)` pair and compare against the expectation.
    fn check_all(cases: &[(&str, &str)]) {
        for &(raw, expected) in cases {
            assert_eq!(decode_entities(raw), expected);
        }
    }

    // Input without '&' must come back borrowed, not copied.
    #[test]
    fn no_entities_borrowed() {
        let decoded = decode_entities("hello world");
        assert!(matches!(decoded, Cow::Borrowed(_)));
        assert_eq!(decoded, "hello world");
    }

    #[test]
    fn named_entities() {
        check_all(&[
            ("&amp;", "&"),
            ("&lt;", "<"),
            ("&gt;", ">"),
            ("&quot;", "\""),
            ("&apos;", "'"),
        ]);
    }

    #[test]
    fn numeric_decimal() {
        check_all(&[("&#60;", "<"), ("&#62;", ">"), ("&#38;", "&")]);
    }

    // Covers both the lowercase and uppercase hex marker.
    #[test]
    fn numeric_hex() {
        check_all(&[("&#x3C;", "<"), ("&#x3E;", ">"), ("&#X3c;", "<")]);
    }

    #[test]
    fn mixed_entities() {
        check_all(&[("a &amp; b &lt; c &#62; d", "a & b < c > d")]);
    }

    // Unknown names are left exactly as written.
    #[test]
    fn unknown_entity_passthrough() {
        check_all(&[("&unknown;", "&unknown;")]);
    }

    // A bare '&' with no terminating ';' stays literal.
    #[test]
    fn ampersand_without_semicolon() {
        check_all(&[("a & b", "a & b")]);
    }

    #[test]
    fn empty_input() {
        let decoded = decode_entities("");
        assert!(matches!(decoded, Cow::Borrowed(_)));
        assert_eq!(decoded, "");
    }

    #[test]
    fn entity_at_end() {
        check_all(&[("hello&amp;", "hello&")]);
    }

    #[test]
    fn consecutive_entities() {
        check_all(&[("&lt;&gt;&amp;", "<>&")]);
    }
}
166}