// streamdown_parser/entities.rs
use std::collections::HashMap;
use std::sync::LazyLock;
5
/// Lookup table from a named HTML character reference (including the leading
/// `&` and trailing `;`) to the text it stands for.
///
/// Keys are case-sensitive: `&dagger;` (†) and `&Dagger;` (‡), as well as
/// `&prime;` (′) and `&Prime;` (″), are distinct entries.
///
/// The table is built lazily on first access and covers only the commonly used
/// subset listed here, not the full HTML named-reference list.
static HTML_ENTITIES: LazyLock<HashMap<&'static str, &'static str>> = LazyLock::new(|| {
    HashMap::from([
        // Legal / branding marks.
        ("&copy;", "©"),
        ("&trade;", "™"),
        ("&reg;", "®"),
        // Markup-significant characters.
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&apos;", "'"),
        // NOTE(review): mapped to a plain space here; the exact code point for
        // `&nbsp;` is U+00A0 — confirm which one downstream rendering expects.
        ("&nbsp;", " "),
        // Dashes and ellipsis.
        ("&mdash;", "—"),
        ("&ndash;", "–"),
        ("&hellip;", "…"),
        // Arrows.
        ("&larr;", "←"),
        ("&rarr;", "→"),
        ("&uarr;", "↑"),
        ("&darr;", "↓"),
        // Math.
        ("&times;", "×"),
        ("&divide;", "÷"),
        ("&plusmn;", "±"),
        ("&ne;", "≠"),
        ("&le;", "≤"),
        ("&ge;", "≥"),
        ("&infin;", "∞"),
        // Currency.
        ("&euro;", "€"),
        ("&pound;", "£"),
        ("&yen;", "¥"),
        ("&cent;", "¢"),
        // Typography and punctuation.
        ("&deg;", "°"),
        ("&para;", "¶"),
        ("&sect;", "§"),
        ("&bull;", "•"),
        ("&middot;", "·"),
        ("&laquo;", "«"),
        ("&raquo;", "»"),
        ("&dagger;", "†"),
        ("&Dagger;", "‡"),
        ("&permil;", "‰"),
        ("&prime;", "′"),
        ("&Prime;", "″"),
    ])
});
57
58pub fn decode_html_entities(text: &str) -> String {
60 let mut result = text.to_string();
61
62 for (entity, replacement) in HTML_ENTITIES.iter() {
64 result = result.replace(entity, replacement);
65 }
66
67 while let Some(start) = result.find("&#") {
70 if let Some(end) = result[start..].find(';') {
71 let entity = &result[start..start + end + 1];
72 let num_str = &entity[2..entity.len() - 1];
73
74 let codepoint = if num_str.starts_with('x') || num_str.starts_with('X') {
75 u32::from_str_radix(&num_str[1..], 16).ok()
77 } else {
78 num_str.parse::<u32>().ok()
80 };
81
82 if let Some(cp) = codepoint {
83 if let Some(c) = char::from_u32(cp) {
84 result = result.replace(entity, &c.to_string());
85 continue;
86 }
87 }
88 }
89 break;
91 }
92
93 result
94}
95
#[cfg(test)]
mod tests {
    use super::*;

    /// Named references from the lookup table decode to their glyphs.
    #[test]
    fn test_named_entities() {
        assert_eq!(decode_html_entities("&copy;"), "©");
        assert_eq!(decode_html_entities("&trade;"), "™");
        assert_eq!(decode_html_entities("&reg;"), "®");
        assert_eq!(decode_html_entities("&amp;"), "&");
    }

    /// Decimal and hexadecimal (either case of the `x` marker) numeric
    /// references decode to the same character.
    #[test]
    fn test_numeric_entities() {
        assert_eq!(decode_html_entities("&#169;"), "©");
        assert_eq!(decode_html_entities("&#xA9;"), "©");
        assert_eq!(decode_html_entities("&#XA9;"), "©");
    }

    /// References embedded in surrounding text are decoded in place.
    #[test]
    fn test_mixed() {
        assert_eq!(
            decode_html_entities("Copyright &copy; 2024"),
            "Copyright © 2024"
        );
        assert_eq!(decode_html_entities("&lt;b&gt;bold&lt;/b&gt;"), "<b>bold</b>");
    }

    /// Input without any reference is returned unchanged.
    #[test]
    fn test_plain_text_is_unchanged() {
        assert_eq!(decode_html_entities(""), "");
        assert_eq!(decode_html_entities("no entities here"), "no entities here");
    }
}