streamdown_parser/
entities.rs

1//! HTML entity decoding
2
3use std::collections::HashMap;
4use std::sync::LazyLock;
5
/// Lookup table from a named HTML entity to its replacement text.
///
/// Keys are the complete entity spelling, including the leading `&` and the
/// trailing `;`. Names are case-sensitive (`&dagger;` and `&Dagger;` are
/// distinct entries). The map is built once, on first access.
static HTML_ENTITIES: LazyLock<HashMap<&'static str, &'static str>> = LazyLock::new(|| {
    const ENTITY_TABLE: &[(&str, &str)] = &[
        // Copyright, trademark, registered
        ("&copy;", "©"),
        ("&trade;", "™"),
        ("&reg;", "®"),
        // Common symbols
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&apos;", "'"),
        ("&nbsp;", " "),
        // Dashes and ellipsis
        ("&mdash;", "—"),
        ("&ndash;", "–"),
        ("&hellip;", "…"),
        // Arrows
        ("&larr;", "←"),
        ("&rarr;", "→"),
        ("&uarr;", "↑"),
        ("&darr;", "↓"),
        // Math
        ("&times;", "×"),
        ("&divide;", "÷"),
        ("&plusmn;", "±"),
        ("&ne;", "≠"),
        ("&le;", "≤"),
        ("&ge;", "≥"),
        ("&infin;", "∞"),
        // Currency
        ("&euro;", "€"),
        ("&pound;", "£"),
        ("&yen;", "¥"),
        ("&cent;", "¢"),
        // Other common
        ("&deg;", "°"),
        ("&para;", "¶"),
        ("&sect;", "§"),
        ("&bull;", "•"),
        ("&middot;", "·"),
        ("&laquo;", "«"),
        ("&raquo;", "»"),
        ("&dagger;", "†"),
        ("&Dagger;", "‡"),
        ("&permil;", "‰"),
        ("&prime;", "′"),
        ("&Prime;", "″"),
    ];

    ENTITY_TABLE.iter().copied().collect()
});
57
58/// Decode HTML entities in a string
59pub fn decode_html_entities(text: &str) -> String {
60    let mut result = text.to_string();
61
62    // Replace named entities
63    for (entity, replacement) in HTML_ENTITIES.iter() {
64        result = result.replace(entity, replacement);
65    }
66
67    // Handle numeric entities like &#169; or &#x00A9;
68    // Decimal: &#123;
69    while let Some(start) = result.find("&#") {
70        if let Some(end) = result[start..].find(';') {
71            let entity = &result[start..start + end + 1];
72            let num_str = &entity[2..entity.len() - 1];
73
74            let codepoint = if num_str.starts_with('x') || num_str.starts_with('X') {
75                // Hex: &#x00A9;
76                u32::from_str_radix(&num_str[1..], 16).ok()
77            } else {
78                // Decimal: &#169;
79                num_str.parse::<u32>().ok()
80            };
81
82            if let Some(cp) = codepoint {
83                if let Some(c) = char::from_u32(cp) {
84                    result = result.replace(entity, &c.to_string());
85                    continue;
86                }
87            }
88        }
89        // If we couldn't parse it, break to avoid infinite loop
90        break;
91    }
92
93    result
94}
95
#[cfg(test)]
mod tests {
    use super::decode_html_entities;

    /// Check that each `(input, expected)` pair decodes to the expected text.
    fn assert_decodes(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(decode_html_entities(input), expected);
        }
    }

    /// Named entities are replaced by their symbol.
    #[test]
    fn test_named_entities() {
        assert_decodes(&[
            ("&copy;", "©"),
            ("&trade;", "™"),
            ("&reg;", "®"),
            ("&amp;", "&"),
        ]);
    }

    /// Decimal and hexadecimal numeric references decode to the same character.
    #[test]
    fn test_numeric_entities() {
        assert_decodes(&[("&#169;", "©"), ("&#x00A9;", "©")]);
    }

    /// Text surrounding an entity is left untouched.
    #[test]
    fn test_mixed() {
        assert_decodes(&[("Copyright &copy; 2024", "Copyright © 2024")]);
    }
}