Skip to main content

fhp_core/
entity.rs

1/// Decode a named HTML entity to its character(s).
2///
3/// Accepts the entity name **without** the leading `&` and trailing `;`.
4/// Returns `None` for unknown entity names.
5///
6/// # Examples
7///
8/// ```
9/// use fhp_core::entity::decode_named;
10///
11/// assert_eq!(decode_named("amp"), Some("&"));
12/// assert_eq!(decode_named("lt"), Some("<"));
13/// assert_eq!(decode_named("nonexistent"), None);
14/// ```
15#[inline]
16pub fn decode_named(name: &str) -> Option<&'static str> {
17    ENTITY_MAP.get(name).copied()
18}
19
/// Decode a numeric character reference (`&#123;` or `&#x1F600;`).
///
/// Accepts the digits **without** `&#`, `&#x`, or the trailing `;`.
/// `is_hex` selects the radix: `true` for hexadecimal (either letter case),
/// `false` for decimal.
///
/// # Returns
///
/// * `Some('\u{FFFD}')` when the value is zero (the HTML spec replaces
///   U+0000 with the replacement character).
/// * `Some(c)` for every other value that is a valid Unicode scalar value.
/// * `None` when `digits` is empty, contains anything other than digits of
///   the selected radix (including a `+` or `-` sign), does not fit in a
///   `u32`, or names a surrogate or a value above U+10FFFF.
///
/// # Examples
///
/// ```
/// use fhp_core::entity::decode_numeric;
///
/// assert_eq!(decode_numeric("60", false), Some('<'));
/// assert_eq!(decode_numeric("3C", true), Some('<'));
/// assert_eq!(decode_numeric("0", false), Some('\u{FFFD}'));
/// assert_eq!(decode_numeric("+60", false), None);
/// ```
pub fn decode_numeric(digits: &str, is_hex: bool) -> Option<char> {
    // The integer parsers accept an optional leading `+`, which is not valid
    // inside a character reference, so check the digit alphabet explicitly.
    let all_digits = digits.bytes().all(|b| {
        if is_hex {
            b.is_ascii_hexdigit()
        } else {
            b.is_ascii_digit()
        }
    });
    if !all_digits {
        return None;
    }

    // Empty input and values that overflow `u32` are rejected here.
    let radix = if is_hex { 16 } else { 10 };
    let codepoint = u32::from_str_radix(digits, radix).ok()?;

    if codepoint == 0 {
        return Some('\u{FFFD}');
    }

    // `None` for surrogates and for anything above U+10FFFF.
    char::from_u32(codepoint)
}
51
52/// Escape HTML text content: `&` → `&amp;`, `<` → `&lt;`, `>` → `&gt;`.
53///
54/// Writes the escaped output into `out`. Unescaped segments are flushed in
55/// bulk for performance — only special characters cause a pause.
56///
57/// # Examples
58///
59/// ```
60/// use fhp_core::entity::escape_text;
61///
62/// let mut buf = String::new();
63/// escape_text("1 < 2 & 3 > 0", &mut buf);
64/// assert_eq!(buf, "1 &lt; 2 &amp; 3 &gt; 0");
65/// ```
66#[inline]
67pub fn escape_text(input: &str, out: &mut String) {
68    escape_impl::<false>(input, out);
69}
70
71/// Escape HTML attribute values: `&` → `&amp;`, `<` → `&lt;`, `>` → `&gt;`, `"` → `&quot;`, `'` → `&#39;`.
72///
73/// Writes the escaped output into `out`. Like [`escape_text`], unescaped
74/// segments are flushed in bulk.
75///
76/// # Examples
77///
78/// ```
79/// use fhp_core::entity::escape_attr;
80///
81/// let mut buf = String::new();
82/// escape_attr("x&y=\"z\"", &mut buf);
83/// assert_eq!(buf, "x&amp;y=&quot;z&quot;");
84/// ```
85#[inline]
86pub fn escape_attr(input: &str, out: &mut String) {
87    escape_impl::<true>(input, out);
88}
89
/// Escaping core shared by the text and attribute entry points.
///
/// `&`, `<` and `>` are always replaced by their entity form. With
/// `ESCAPE_QUOTES` set to `true`, `"` and `'` are replaced as well (needed
/// for attribute values). The escaped text is appended to `out`.
#[inline(always)]
fn escape_impl<const ESCAPE_QUOTES: bool>(input: &str, out: &mut String) {
    // Lower bound on the output size: escaping never shrinks the text.
    out.reserve(input.len());

    // Maps a byte to its entity, or `None` when it is copied through as-is.
    let entity_for = |byte: u8| -> Option<&'static str> {
        match byte {
            b'&' => Some("&amp;"),
            b'<' => Some("&lt;"),
            b'>' => Some("&gt;"),
            b'"' if ESCAPE_QUOTES => Some("&quot;"),
            b'\'' if ESCAPE_QUOTES => Some("&#39;"),
            _ => None,
        }
    };

    // Start of the run of bytes that has not been written to `out` yet.
    let mut pending_start = 0;

    for (idx, byte) in input.bytes().enumerate() {
        if let Some(entity) = entity_for(byte) {
            // Every escaped byte is ASCII, so `idx` and `idx + 1` are always
            // valid UTF-8 boundaries for slicing.
            out.push_str(&input[pending_start..idx]);
            out.push_str(entity);
            pending_start = idx + 1;
        }
    }

    // Flush whatever follows the last escaped byte (or the whole input).
    out.push_str(&input[pending_start..]);
}
116
117/// Compile-time perfect-hash map of the most common HTML named entities.
118///
119/// This covers the ~250 most-used entities. The full HTML5 spec defines
120/// ~2200, but the long tail is almost never seen in practice.
121static ENTITY_MAP: phf::Map<&'static str, &'static str> = phf::phf_map! {
122    // Most common
123    "amp"    => "&",
124    "lt"     => "<",
125    "gt"     => ">",
126    "quot"   => "\"",
127    "apos"   => "'",
128    "nbsp"   => "\u{00A0}",
129
130    // Latin supplement
131    "iexcl"  => "\u{00A1}",
132    "cent"   => "\u{00A2}",
133    "pound"  => "\u{00A3}",
134    "curren" => "\u{00A4}",
135    "yen"    => "\u{00A5}",
136    "brvbar" => "\u{00A6}",
137    "sect"   => "\u{00A7}",
138    "uml"    => "\u{00A8}",
139    "copy"   => "\u{00A9}",
140    "ordf"   => "\u{00AA}",
141    "laquo"  => "\u{00AB}",
142    "not"    => "\u{00AC}",
143    "shy"    => "\u{00AD}",
144    "reg"    => "\u{00AE}",
145    "macr"   => "\u{00AF}",
146    "deg"    => "\u{00B0}",
147    "plusmn" => "\u{00B1}",
148    "sup2"   => "\u{00B2}",
149    "sup3"   => "\u{00B3}",
150    "acute"  => "\u{00B4}",
151    "micro"  => "\u{00B5}",
152    "para"   => "\u{00B6}",
153    "middot" => "\u{00B7}",
154    "cedil"  => "\u{00B8}",
155    "sup1"   => "\u{00B9}",
156    "ordm"   => "\u{00BA}",
157    "raquo"  => "\u{00BB}",
158    "frac14" => "\u{00BC}",
159    "frac12" => "\u{00BD}",
160    "frac34" => "\u{00BE}",
161    "iquest" => "\u{00BF}",
162
163    // Accented Latin
164    "Agrave" => "\u{00C0}",
165    "Aacute" => "\u{00C1}",
166    "Acirc"  => "\u{00C2}",
167    "Atilde" => "\u{00C3}",
168    "Auml"   => "\u{00C4}",
169    "Aring"  => "\u{00C5}",
170    "AElig"  => "\u{00C6}",
171    "Ccedil" => "\u{00C7}",
172    "Egrave" => "\u{00C8}",
173    "Eacute" => "\u{00C9}",
174    "Ecirc"  => "\u{00CA}",
175    "Euml"   => "\u{00CB}",
176    "Igrave" => "\u{00CC}",
177    "Iacute" => "\u{00CD}",
178    "Icirc"  => "\u{00CE}",
179    "Iuml"   => "\u{00CF}",
180    "ETH"    => "\u{00D0}",
181    "Ntilde" => "\u{00D1}",
182    "Ograve" => "\u{00D2}",
183    "Oacute" => "\u{00D3}",
184    "Ocirc"  => "\u{00D4}",
185    "Otilde" => "\u{00D5}",
186    "Ouml"   => "\u{00D6}",
187    "times"  => "\u{00D7}",
188    "Oslash" => "\u{00D8}",
189    "Ugrave" => "\u{00D9}",
190    "Uacute" => "\u{00DA}",
191    "Ucirc"  => "\u{00DB}",
192    "Uuml"   => "\u{00DC}",
193    "Yacute" => "\u{00DD}",
194    "THORN"  => "\u{00DE}",
195    "szlig"  => "\u{00DF}",
196    "agrave" => "\u{00E0}",
197    "aacute" => "\u{00E1}",
198    "acirc"  => "\u{00E2}",
199    "atilde" => "\u{00E3}",
200    "auml"   => "\u{00E4}",
201    "aring"  => "\u{00E5}",
202    "aelig"  => "\u{00E6}",
203    "ccedil" => "\u{00E7}",
204    "egrave" => "\u{00E8}",
205    "eacute" => "\u{00E9}",
206    "ecirc"  => "\u{00EA}",
207    "euml"   => "\u{00EB}",
208    "igrave" => "\u{00EC}",
209    "iacute" => "\u{00ED}",
210    "icirc"  => "\u{00EE}",
211    "iuml"   => "\u{00EF}",
212    "eth"    => "\u{00F0}",
213    "ntilde" => "\u{00F1}",
214    "ograve" => "\u{00F2}",
215    "oacute" => "\u{00F3}",
216    "ocirc"  => "\u{00F4}",
217    "otilde" => "\u{00F5}",
218    "ouml"   => "\u{00F6}",
219    "divide" => "\u{00F7}",
220    "oslash" => "\u{00F8}",
221    "ugrave" => "\u{00F9}",
222    "uacute" => "\u{00FA}",
223    "ucirc"  => "\u{00FB}",
224    "uuml"   => "\u{00FC}",
225    "yacute" => "\u{00FD}",
226    "thorn"  => "\u{00FE}",
227    "yuml"   => "\u{00FF}",
228
229    // Greek
230    "Alpha"   => "\u{0391}",
231    "Beta"    => "\u{0392}",
232    "Gamma"   => "\u{0393}",
233    "Delta"   => "\u{0394}",
234    "Epsilon" => "\u{0395}",
235    "Zeta"    => "\u{0396}",
236    "Eta"     => "\u{0397}",
237    "Theta"   => "\u{0398}",
238    "Iota"    => "\u{0399}",
239    "Kappa"   => "\u{039A}",
240    "Lambda"  => "\u{039B}",
241    "Mu"      => "\u{039C}",
242    "Nu"      => "\u{039D}",
243    "Xi"      => "\u{039E}",
244    "Omicron" => "\u{039F}",
245    "Pi"      => "\u{03A0}",
246    "Rho"     => "\u{03A1}",
247    "Sigma"   => "\u{03A3}",
248    "Tau"     => "\u{03A4}",
249    "Upsilon" => "\u{03A5}",
250    "Phi"     => "\u{03A6}",
251    "Chi"     => "\u{03A7}",
252    "Psi"     => "\u{03A8}",
253    "Omega"   => "\u{03A9}",
254    "alpha"   => "\u{03B1}",
255    "beta"    => "\u{03B2}",
256    "gamma"   => "\u{03B3}",
257    "delta"   => "\u{03B4}",
258    "epsilon" => "\u{03B5}",
259    "zeta"    => "\u{03B6}",
260    "eta"     => "\u{03B7}",
261    "theta"   => "\u{03B8}",
262    "iota"    => "\u{03B9}",
263    "kappa"   => "\u{03BA}",
264    "lambda"  => "\u{03BB}",
265    "mu"      => "\u{03BC}",
266    "nu"      => "\u{03BD}",
267    "xi"      => "\u{03BE}",
268    "omicron" => "\u{03BF}",
269    "pi"      => "\u{03C0}",
270    "rho"     => "\u{03C1}",
271    "sigmaf"  => "\u{03C2}",
272    "sigma"   => "\u{03C3}",
273    "tau"     => "\u{03C4}",
274    "upsilon" => "\u{03C5}",
275    "phi"     => "\u{03C6}",
276    "chi"     => "\u{03C7}",
277    "psi"     => "\u{03C8}",
278    "omega"   => "\u{03C9}",
279
280    // Math / symbols
281    "bull"    => "\u{2022}",
282    "hellip"  => "\u{2026}",
283    "prime"   => "\u{2032}",
284    "Prime"   => "\u{2033}",
285    "oline"   => "\u{203E}",
286    "frasl"   => "\u{2044}",
287    "trade"   => "\u{2122}",
288    "larr"    => "\u{2190}",
289    "uarr"    => "\u{2191}",
290    "rarr"    => "\u{2192}",
291    "darr"    => "\u{2193}",
292    "harr"    => "\u{2194}",
293    "lArr"    => "\u{21D0}",
294    "uArr"    => "\u{21D1}",
295    "rArr"    => "\u{21D2}",
296    "dArr"    => "\u{21D3}",
297    "hArr"    => "\u{21D4}",
298    "forall"  => "\u{2200}",
299    "part"    => "\u{2202}",
300    "exist"   => "\u{2203}",
301    "empty"   => "\u{2205}",
302    "nabla"   => "\u{2207}",
303    "isin"    => "\u{2208}",
304    "notin"   => "\u{2209}",
305    "ni"      => "\u{220B}",
306    "prod"    => "\u{220F}",
307    "sum"     => "\u{2211}",
308    "minus"   => "\u{2212}",
309    "lowast"  => "\u{2217}",
310    "radic"   => "\u{221A}",
311    "prop"    => "\u{221D}",
312    "infin"   => "\u{221E}",
313    "ang"     => "\u{2220}",
314    "and"     => "\u{2227}",
315    "or"      => "\u{2228}",
316    "cap"     => "\u{2229}",
317    "cup"     => "\u{222A}",
318    "int"     => "\u{222B}",
319    "there4"  => "\u{2234}",
320    "sim"     => "\u{223C}",
321    "cong"    => "\u{2245}",
322    "asymp"   => "\u{2248}",
323    "ne"      => "\u{2260}",
324    "equiv"   => "\u{2261}",
325    "le"      => "\u{2264}",
326    "ge"      => "\u{2265}",
327    "sub"     => "\u{2282}",
328    "sup"     => "\u{2283}",
329    "nsub"    => "\u{2284}",
330    "sube"    => "\u{2286}",
331    "supe"    => "\u{2287}",
332    "oplus"   => "\u{2295}",
333    "otimes"  => "\u{2297}",
334    "perp"    => "\u{22A5}",
335    "sdot"    => "\u{22C5}",
336
337    // Punctuation / typography
338    "ensp"    => "\u{2002}",
339    "emsp"    => "\u{2003}",
340    "thinsp"  => "\u{2009}",
341    "zwnj"    => "\u{200C}",
342    "zwj"     => "\u{200D}",
343    "lrm"     => "\u{200E}",
344    "rlm"     => "\u{200F}",
345    "ndash"   => "\u{2013}",
346    "mdash"   => "\u{2014}",
347    "lsquo"   => "\u{2018}",
348    "rsquo"   => "\u{2019}",
349    "sbquo"   => "\u{201A}",
350    "ldquo"   => "\u{201C}",
351    "rdquo"   => "\u{201D}",
352    "bdquo"   => "\u{201E}",
353    "dagger"  => "\u{2020}",
354    "Dagger"  => "\u{2021}",
355    "permil"  => "\u{2030}",
356    "lsaquo"  => "\u{2039}",
357    "rsaquo"  => "\u{203A}",
358    "euro"    => "\u{20AC}",
359
360    // Miscellaneous
361    "OElig"   => "\u{0152}",
362    "oelig"   => "\u{0153}",
363    "Scaron"  => "\u{0160}",
364    "scaron"  => "\u{0161}",
365    "Yuml"    => "\u{0178}",
366    "circ"    => "\u{02C6}",
367    "tilde"   => "\u{02DC}",
368    "fnof"    => "\u{0192}",
369
370    // Card suits / misc symbols
371    "spades"  => "\u{2660}",
372    "clubs"   => "\u{2663}",
373    "hearts"  => "\u{2665}",
374    "diams"   => "\u{2666}",
375    "loz"     => "\u{25CA}",
376    "lceil"   => "\u{2308}",
377    "rceil"   => "\u{2309}",
378    "lfloor"  => "\u{230A}",
379    "rfloor"  => "\u{230B}",
380    "lang"    => "\u{2329}",
381    "rang"    => "\u{232A}",
382};
383
#[cfg(test)]
mod tests {
    use super::*;

    /// Run `escape_text` into a fresh buffer and hand the buffer back.
    fn text_escaped(input: &str) -> String {
        let mut out = String::new();
        escape_text(input, &mut out);
        out
    }

    /// Run `escape_attr` into a fresh buffer and hand the buffer back.
    fn attr_escaped(input: &str) -> String {
        let mut out = String::new();
        escape_attr(input, &mut out);
        out
    }

    /// Assert that every `(name, expansion)` pair decodes as listed.
    fn assert_named(cases: &[(&str, &'static str)]) {
        for &(name, expansion) in cases {
            assert_eq!(decode_named(name), Some(expansion), "entity {:?}", name);
        }
    }

    /// Assert that every `(digits, expected)` pair decodes as listed.
    fn assert_numeric(is_hex: bool, cases: &[(&str, Option<char>)]) {
        for &(digits, expected) in cases {
            assert_eq!(
                decode_numeric(digits, is_hex),
                expected,
                "digits {:?}, is_hex = {}",
                digits,
                is_hex
            );
        }
    }

    // ---- decode_named tests ----

    #[test]
    fn common_named_entities() {
        assert_named(&[
            ("amp", "&"),
            ("lt", "<"),
            ("gt", ">"),
            ("quot", "\""),
            ("apos", "'"),
            ("nbsp", "\u{00A0}"),
        ]);
    }

    #[test]
    fn unknown_entity() {
        for name in ["nonexistent", ""].iter() {
            assert_eq!(decode_named(name), None);
        }
    }

    #[test]
    fn greek_entities() {
        assert_named(&[
            ("alpha", "\u{03B1}"),
            ("omega", "\u{03C9}"),
            ("Sigma", "\u{03A3}"),
        ]);
    }

    #[test]
    fn typography_entities() {
        assert_named(&[
            ("mdash", "\u{2014}"),
            ("euro", "\u{20AC}"),
            ("trade", "\u{2122}"),
        ]);
    }

    // ---- decode_numeric tests ----

    #[test]
    fn numeric_decimal() {
        assert_numeric(
            false,
            &[
                ("60", Some('<')),
                ("62", Some('>')),
                ("38", Some('&')),
                ("128512", Some('\u{1F600}')),
            ],
        );
    }

    #[test]
    fn numeric_hex() {
        assert_numeric(
            true,
            &[
                ("3C", Some('<')),
                ("3e", Some('>')),
                ("1F600", Some('\u{1F600}')),
            ],
        );
    }

    #[test]
    fn numeric_null_replaced() {
        // Zero maps to the replacement character in both radices.
        assert_numeric(false, &[("0", Some('\u{FFFD}'))]);
        assert_numeric(true, &[("0", Some('\u{FFFD}'))]);
    }

    #[test]
    fn numeric_invalid() {
        assert_numeric(true, &[("FFFFFF", None)]); // > U+10FFFF
        assert_numeric(false, &[("abc", None)]); // not decimal
        assert_numeric(false, &[("", None)]);
    }

    // ---- escape_text tests ----

    #[test]
    fn escape_text_special_chars() {
        let cases = [("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")];
        for &(input, expected) in cases.iter() {
            assert_eq!(text_escaped(input), expected);
        }
    }

    #[test]
    fn escape_text_mixed() {
        assert_eq!(text_escaped("1 < 2 & 3 > 0"), "1 &lt; 2 &amp; 3 &gt; 0");
    }

    #[test]
    fn escape_text_plain() {
        assert_eq!(text_escaped("hello world"), "hello world");
    }

    #[test]
    fn escape_text_empty() {
        assert_eq!(text_escaped(""), "");
    }

    #[test]
    fn escape_text_all_special() {
        assert_eq!(text_escaped("&<>"), "&amp;&lt;&gt;");
    }

    #[test]
    fn escape_text_does_not_escape_quotes() {
        assert_eq!(text_escaped("say \"hello\" it's"), "say \"hello\" it's");
    }

    // ---- escape_attr tests ----

    #[test]
    fn escape_attr_quote() {
        assert_eq!(attr_escaped("say \"hello\""), "say &quot;hello&quot;");
    }

    #[test]
    fn escape_attr_mixed() {
        assert_eq!(attr_escaped("x&y=\"z\""), "x&amp;y=&quot;z&quot;");
    }

    #[test]
    fn escape_attr_plain() {
        assert_eq!(attr_escaped("plain"), "plain");
    }

    #[test]
    fn escape_attr_single_quote() {
        assert_eq!(attr_escaped("it's"), "it&#39;s");
    }

    #[test]
    fn escape_attr_empty() {
        assert_eq!(attr_escaped(""), "");
    }
}
534}