1use regex::Regex;
2
/// Converts HTML markup into plain text.
///
/// Thin public entry point: forwards `html` unchanged to
/// [`html_to_text_impl`] and returns whatever that function produces.
pub fn html_to_text(html: &str) -> String {
    html_to_text_impl(html)
}
11
12pub fn html_to_text_impl(html: &str) -> String {
14 let mut text = html.to_string();
15
16 let re_head = Regex::new(r"(?is)<head[\s>].*?</head>").unwrap();
18 text = re_head.replace_all(&text, "").to_string();
19 let re_script = Regex::new(r"(?is)<script[\s>].*?</script>").unwrap();
20 text = re_script.replace_all(&text, "").to_string();
21 let re_style = Regex::new(r"(?is)<style[\s>].*?</style>").unwrap();
22 text = re_style.replace_all(&text, "").to_string();
23 let re_comment = Regex::new(r"(?s)<!--.*?-->").unwrap();
24 text = re_comment.replace_all(&text, "").to_string();
25
26 for level in 1..=6usize {
28 let pattern = format!(r"(?is)<h{0}\b[^>]*>(.*?)</h{0}\s*>", level);
29 let re = Regex::new(&pattern).unwrap();
30 let prefix = "#".repeat(level);
31 text = re
32 .replace_all(&text, |caps: ®ex::Captures| {
33 format!("\n{} {}\n", prefix, &caps[1])
34 })
35 .to_string();
36 }
37
38 let re_li = Regex::new(r"(?i)<li\b[^>]*>").unwrap();
40 text = re_li.replace_all(&text, "\n- ").to_string();
41
42 let re_b = Regex::new(r"(?is)<b\b[^>]*>(.*?)</b\s*>").unwrap();
44 text = re_b.replace_all(&text, "**$1**").to_string();
45 let re_strong = Regex::new(r"(?is)<strong\b[^>]*>(.*?)</strong\s*>").unwrap();
46 text = re_strong.replace_all(&text, "**$1**").to_string();
47
48 let re_img_alt = Regex::new(r#"(?i)<img\b[^>]*\balt=["']([^"']*)["'][^>]*/?\s*>"#).unwrap();
50 text = re_img_alt.replace_all(&text, "[image: $1]").to_string();
51 let re_img_no_alt = Regex::new(r"(?i)<img\b[^>]*/?\s*>").unwrap();
52 text = re_img_no_alt.replace_all(&text, "").to_string();
53
54 let re_cell = Regex::new(r"(?i)<(td|th)\b[^>]*>").unwrap();
56 text = re_cell.replace_all(&text, "\t").to_string();
57
58 let re_br = Regex::new(r"(?i)<br\b[^>]*/?\s*>").unwrap();
60 text = re_br.replace_all(&text, "\n").to_string();
61
62 let re_block_open = Regex::new(
64 r"(?i)<(p|div|section|article|table|tr|ul|ol|blockquote|header|footer|nav|main|aside|figure|figcaption|details|summary|dl|dt|dd|pre|address)\b[^>]*>",
65 )
66 .unwrap();
67 text = re_block_open.replace_all(&text, "\n").to_string();
68 let re_block_close = Regex::new(
69 r"(?i)</(p|div|section|article|table|tr|ul|ol|blockquote|header|footer|nav|main|aside|figure|figcaption|details|summary|dl|dt|dd|pre|address|h[1-6])>",
70 )
71 .unwrap();
72 text = re_block_close.replace_all(&text, "\n").to_string();
73
74 let re_tags = Regex::new(r"<[^>]+>").unwrap();
76 text = re_tags.replace_all(&text, "").to_string();
77
78 text = decode_entities(&text);
80
81 let re_hspace = Regex::new(r"[^\S\n]+").unwrap();
84 text = re_hspace.replace_all(&text, " ").to_string();
85 let re_blanks = Regex::new(r"\n{3,}").unwrap();
87 text = re_blanks.replace_all(&text, "\n\n").to_string();
88 let trimmed: Vec<&str> = text.lines().map(|l| l.trim()).collect();
90 text = trimmed.join("\n");
91
92 text.trim().to_string()
93}
94
/// Decodes HTML character references in `text` in one left-to-right pass.
///
/// Recognized forms:
/// - the named entities listed in `NAMED_ENTITIES` (case-sensitive),
/// - decimal references such as `&#65;`,
/// - hexadecimal references such as `&#x41;` or `&#X41;`.
///
/// Every reference is decoded exactly once: text produced by a replacement
/// is never re-scanned, so `&amp;lt;` yields `&lt;` and `&#38;amp;` yields
/// `&amp;`. Unknown or malformed references are copied verbatim. A
/// well-formed numeric reference that does not denote a valid `char`
/// (surrogates, out-of-range values) is removed from the output.
fn decode_entities(text: &str) -> String {
    // Entity name (without the surrounding `&` and `;`) -> replacement.
    const NAMED_ENTITIES: &[(&str, char)] = &[
        ("amp", '&'),
        ("lt", '<'),
        ("gt", '>'),
        ("quot", '"'),
        ("apos", '\''),
        ("nbsp", ' '),
        ("mdash", '\u{2014}'),
        ("ndash", '\u{2013}'),
        ("laquo", '\u{00AB}'),
        ("raquo", '\u{00BB}'),
        ("hellip", '\u{2026}'),
        ("bull", '\u{2022}'),
        ("lsquo", '\u{2018}'),
        ("rsquo", '\u{2019}'),
        ("ldquo", '\u{201C}'),
        ("rdquo", '\u{201D}'),
        ("copy", '\u{00A9}'),
        ("reg", '\u{00AE}'),
        ("trade", '\u{2122}'),
        ("sect", '\u{00A7}'),
        ("para", '\u{00B6}'),
        ("deg", '\u{00B0}'),
        ("times", '\u{00D7}'),
        ("divide", '\u{00F7}'),
        ("frac12", '\u{00BD}'),
        ("frac14", '\u{00BC}'),
        ("frac34", '\u{00BE}'),
        ("plusmn", '\u{00B1}'),
        ("micro", '\u{00B5}'),
        ("aelig", '\u{00E6}'),
        ("AElig", '\u{00C6}'),
        ("oslash", '\u{00F8}'),
        ("Oslash", '\u{00D8}'),
        ("aring", '\u{00E5}'),
        ("Aring", '\u{00C5}'),
        ("auml", '\u{00E4}'),
        ("Auml", '\u{00C4}'),
        ("ouml", '\u{00F6}'),
        ("Ouml", '\u{00D6}'),
        ("uuml", '\u{00FC}'),
        ("Uuml", '\u{00DC}'),
        ("szlig", '\u{00DF}'),
        ("ntilde", '\u{00F1}'),
        ("Ntilde", '\u{00D1}'),
        ("ccedil", '\u{00E7}'),
        ("Ccedil", '\u{00C7}'),
        ("eacute", '\u{00E9}'),
        ("Eacute", '\u{00C9}'),
        ("egrave", '\u{00E8}'),
        ("Egrave", '\u{00C8}'),
        ("ecirc", '\u{00EA}'),
        ("Ecirc", '\u{00CA}'),
        ("agrave", '\u{00E0}'),
        ("Agrave", '\u{00C0}'),
        ("aacute", '\u{00E1}'),
        ("Aacute", '\u{00C1}'),
        ("acirc", '\u{00E2}'),
        ("Acirc", '\u{00C2}'),
        ("iacute", '\u{00ED}'),
        ("Iacute", '\u{00CD}'),
        ("igrave", '\u{00EC}'),
        ("Igrave", '\u{00CC}'),
        ("ocirc", '\u{00F4}'),
        ("Ocirc", '\u{00D4}'),
        ("oacute", '\u{00F3}'),
        ("Oacute", '\u{00D3}'),
        ("ograve", '\u{00F2}'),
        ("Ograve", '\u{00D2}'),
        ("uacute", '\u{00FA}'),
        ("Uacute", '\u{00DA}'),
        ("ugrave", '\u{00F9}'),
        ("Ugrave", '\u{00D9}'),
    ];

    // Resolves the text between `&` and `;`.
    //   None          -> not a recognized reference; caller keeps it verbatim
    //   Some(None)    -> well-formed numeric reference to an invalid code point
    //   Some(Some(c)) -> decoded character
    let resolve = |body: &str| -> Option<Option<char>> {
        if let Some(number) = body.strip_prefix('#') {
            let (digits, radix) = match number.strip_prefix(|c: char| c == 'x' || c == 'X') {
                Some(hex) => (hex, 16),
                None => (number, 10),
            };
            let well_formed = !digits.is_empty() && digits.chars().all(|c| c.is_digit(radix));
            if !well_formed {
                return None;
            }
            // Overflowing values and non-scalar code points decode to nothing.
            return Some(
                u32::from_str_radix(digits, radix)
                    .ok()
                    .and_then(char::from_u32),
            );
        }
        NAMED_ENTITIES
            .iter()
            .find(|&&(name, _)| name == body)
            .map(|&(_, replacement)| Some(replacement))
    };

    let mut decoded = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(amp) = rest.find('&') {
        decoded.push_str(&rest[..amp]);
        let candidate = &rest[amp + 1..];

        // A reference body consists solely of ASCII alphanumerics and `#`,
        // so the scan stops at the next `&`, space, etc. and `body_len` is
        // always a valid char boundary.
        let body_len = candidate
            .bytes()
            .take_while(|b| b.is_ascii_alphanumeric() || *b == b'#')
            .count();
        let resolved = if candidate.as_bytes().get(body_len) == Some(&b';') {
            resolve(&candidate[..body_len])
        } else {
            None
        };

        match resolved {
            Some(replacement) => {
                decoded.extend(replacement);
                // Skip the body and the terminating `;`.
                rest = &candidate[body_len + 1..];
            }
            None => {
                // Not a reference: keep the ampersand and continue right
                // after it so a later `&` in the same run is still seen.
                decoded.push('&');
                rest = candidate;
            }
        }
    }
    decoded.push_str(rest);
    decoded
}
210
#[cfg(test)]
mod tests {
    //! End-to-end checks of the HTML-to-text pipeline, one concern per test.

    use super::*;

    /// The whole `<head>` element, including nested tags, is discarded.
    #[test]
    fn test_strip_head() {
        let html =
            "<html><head><title>T</title><style>body{}</style></head><body>Hello</body></html>";
        let result = html_to_text_impl(html);
        assert_eq!(result, "Hello");
    }

    /// Heading levels map to the matching number of `#` characters.
    #[test]
    fn test_headings() {
        let html = "<h1>Title</h1><h2>Sub</h2><p>Text</p>";
        let result = html_to_text_impl(html);
        assert!(result.contains("# Title"));
        assert!(result.contains("## Sub"));
        assert!(result.contains("Text"));
    }

    /// List items become `- ` bullets.
    #[test]
    fn test_list_items() {
        let html = "<ul><li>Alpha</li><li>Beta</li></ul>";
        let result = html_to_text_impl(html);
        assert!(result.contains("- Alpha"));
        assert!(result.contains("- Beta"));
    }

    /// Both `<strong>` and `<b>` are rendered with `**` markers.
    #[test]
    fn test_bold() {
        let html = "<p>Hello <strong>world</strong> and <b>rust</b></p>";
        let result = html_to_text_impl(html);
        assert!(result.contains("**world**"));
        assert!(result.contains("**rust**"));
    }

    /// Images keep their alt text; images without alt text vanish entirely.
    #[test]
    fn test_images() {
        let html = r#"<img alt="logo" src="logo.png"><img src="spacer.gif">"#;
        let result = html_to_text_impl(html);
        assert!(result.contains("[image: logo]"));
        assert!(!result.contains("spacer"));
    }

    /// Cell contents of header and data rows survive tag stripping.
    #[test]
    fn test_tables() {
        let html =
            "<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>";
        let result = html_to_text_impl(html);
        assert!(result.contains("Name"));
        assert!(result.contains("Age"));
        assert!(result.contains("Alice"));
        assert!(result.contains("30"));
    }

    /// Named entities are decoded after tags have been stripped, so an
    /// encoded `<tag>` shows up literally in the output.
    #[test]
    fn test_entities() {
        let html = "<p>&lt;tag&gt; &amp; &quot;quotes&quot; &copy; &sect;</p>";
        let result = html_to_text_impl(html);
        assert!(result.contains("<tag>"));
        assert!(result.contains("& \"quotes\""));
        assert!(result.contains("\u{00A9}"));
        assert!(result.contains("\u{00A7}"));
    }

    /// Decimal and hexadecimal character references are decoded.
    #[test]
    fn test_numeric_entities() {
        let html = "<p>&#65;&#x42;</p>";
        let result = html_to_text_impl(html);
        assert!(result.contains("AB"));
    }

    /// A double-encoded entity is decoded one level only.
    #[test]
    fn test_double_encoded_entities() {
        let html = "<p>&amp;lt; should stay as &amp;lt;</p>";
        let result = html_to_text_impl(html);
        assert!(result.contains("&lt;"));
    }

    /// Script and style bodies never leak into the text.
    #[test]
    fn test_script_style_removed() {
        let html =
            "<p>Before</p><script>alert('xss')</script><style>.a{color:red}</style><p>After</p>";
        let result = html_to_text_impl(html);
        assert!(result.contains("Before"));
        assert!(result.contains("After"));
        assert!(!result.contains("alert"));
        assert!(!result.contains("color"));
    }

    /// HTML comments are removed together with their content.
    #[test]
    fn test_comments_removed() {
        let html = "<p>A<!-- hidden -->B</p>";
        let result = html_to_text_impl(html);
        assert!(result.contains("AB") || result.contains("A B"));
        assert!(!result.contains("hidden"));
    }

    /// Anchor text is kept while the URL in `href` is dropped.
    #[test]
    fn test_links_stripped() {
        let html = r#"<a href="https://example.com">click here</a>"#;
        let result = html_to_text_impl(html);
        assert!(result.contains("click here"));
        assert!(!result.contains("https://"));
    }

    /// Runs of spaces collapse to one space and runs of blank lines to one
    /// blank line.
    #[test]
    fn test_whitespace_collapsed() {
        let html = "<p>  lots   of   spaces  </p>\n\n\n\n<p>after gap</p>";
        let result = html_to_text_impl(html);
        assert!(!result.contains("  "));
        assert!(!result.contains("\n\n\n"));
    }

    /// All three spellings of `<br>` produce a line break.
    #[test]
    fn test_br_tags() {
        let html = "line1<br>line2<br/>line3<br />line4";
        let result = html_to_text_impl(html);
        assert!(result.contains("line1\nline2"));
        assert!(result.contains("line3"));
        assert!(result.contains("line4"));
    }
}