Skip to main content

mcp_methods/
html.rs

1use regex::Regex;
2
3/// Convert HTML to clean, readable text optimized for LLM consumption.
4///
5/// Strips tags, converts headings to markdown `#` prefixes, list items to
6/// `- ` bullets, bold to `**text**`, images to `[image: alt]`, tables to
7/// tab-separated text, and decodes HTML entities.
8pub fn html_to_text(html: &str) -> String {
9    html_to_text_impl(html)
10}
11
12/// Core implementation shared by the standalone function and read_file transform.
13pub fn html_to_text_impl(html: &str) -> String {
14    let mut text = html.to_string();
15
16    // ── 1. Remove non-content sections ────────────────────────────────
17    let re_head = Regex::new(r"(?is)<head[\s>].*?</head>").unwrap();
18    text = re_head.replace_all(&text, "").to_string();
19    let re_script = Regex::new(r"(?is)<script[\s>].*?</script>").unwrap();
20    text = re_script.replace_all(&text, "").to_string();
21    let re_style = Regex::new(r"(?is)<style[\s>].*?</style>").unwrap();
22    text = re_style.replace_all(&text, "").to_string();
23    let re_comment = Regex::new(r"(?s)<!--.*?-->").unwrap();
24    text = re_comment.replace_all(&text, "").to_string();
25
26    // ── 2. Headings → markdown # prefix ───────────────────────────────
27    for level in 1..=6usize {
28        let pattern = format!(r"(?is)<h{0}\b[^>]*>(.*?)</h{0}\s*>", level);
29        let re = Regex::new(&pattern).unwrap();
30        let prefix = "#".repeat(level);
31        text = re
32            .replace_all(&text, |caps: &regex::Captures| {
33                format!("\n{} {}\n", prefix, &caps[1])
34            })
35            .to_string();
36    }
37
38    // ── 3. List items → "- " prefix ──────────────────────────────────
39    let re_li = Regex::new(r"(?i)<li\b[^>]*>").unwrap();
40    text = re_li.replace_all(&text, "\n- ").to_string();
41
42    // ── 4. Bold / strong → **text** ──────────────────────────────────
43    let re_b = Regex::new(r"(?is)<b\b[^>]*>(.*?)</b\s*>").unwrap();
44    text = re_b.replace_all(&text, "**$1**").to_string();
45    let re_strong = Regex::new(r"(?is)<strong\b[^>]*>(.*?)</strong\s*>").unwrap();
46    text = re_strong.replace_all(&text, "**$1**").to_string();
47
48    // ── 5. Images → [image: alt_text] ────────────────────────────────
49    let re_img_alt = Regex::new(r#"(?i)<img\b[^>]*\balt=["']([^"']*)["'][^>]*/?\s*>"#).unwrap();
50    text = re_img_alt.replace_all(&text, "[image: $1]").to_string();
51    let re_img_no_alt = Regex::new(r"(?i)<img\b[^>]*/?\s*>").unwrap();
52    text = re_img_no_alt.replace_all(&text, "").to_string();
53
54    // ── 6. Table cells → tab separator ───────────────────────────────
55    let re_cell = Regex::new(r"(?i)<(td|th)\b[^>]*>").unwrap();
56    text = re_cell.replace_all(&text, "\t").to_string();
57
58    // ── 7. <br> → newline ────────────────────────────────────────────
59    let re_br = Regex::new(r"(?i)<br\b[^>]*/?\s*>").unwrap();
60    text = re_br.replace_all(&text, "\n").to_string();
61
62    // ── 8. Block-level tags → newline ────────────────────────────────
63    let re_block_open = Regex::new(
64        r"(?i)<(p|div|section|article|table|tr|ul|ol|blockquote|header|footer|nav|main|aside|figure|figcaption|details|summary|dl|dt|dd|pre|address)\b[^>]*>",
65    )
66    .unwrap();
67    text = re_block_open.replace_all(&text, "\n").to_string();
68    let re_block_close = Regex::new(
69        r"(?i)</(p|div|section|article|table|tr|ul|ol|blockquote|header|footer|nav|main|aside|figure|figcaption|details|summary|dl|dt|dd|pre|address|h[1-6])>",
70    )
71    .unwrap();
72    text = re_block_close.replace_all(&text, "\n").to_string();
73
74    // ── 9. Strip remaining HTML tags ─────────────────────────────────
75    let re_tags = Regex::new(r"<[^>]+>").unwrap();
76    text = re_tags.replace_all(&text, "").to_string();
77
78    // ── 10. Decode HTML entities ─────────────────────────────────────
79    text = decode_entities(&text);
80
81    // ── 11. Collapse whitespace ──────────────────────────────────────
82    // Horizontal whitespace → single space (preserve newlines)
83    let re_hspace = Regex::new(r"[^\S\n]+").unwrap();
84    text = re_hspace.replace_all(&text, " ").to_string();
85    // 3+ consecutive newlines → 2
86    let re_blanks = Regex::new(r"\n{3,}").unwrap();
87    text = re_blanks.replace_all(&text, "\n\n").to_string();
88    // Trim each line
89    let trimmed: Vec<&str> = text.lines().map(|l| l.trim()).collect();
90    text = trimmed.join("\n");
91
92    text.trim().to_string()
93}
94
/// Named character references recognised by `decode_entities`, written
/// exactly as they appear in markup (leading `&`, trailing `;`,
/// case-sensitive) and paired with their replacement text.
const NAMED_ENTITIES: &[(&str, &str)] = &[
    ("&amp;", "&"),
    ("&lt;", "<"),
    ("&gt;", ">"),
    ("&quot;", "\""),
    ("&apos;", "'"),
    ("&nbsp;", " "),
    // Punctuation & symbols
    ("&mdash;", "\u{2014}"),
    ("&ndash;", "\u{2013}"),
    ("&laquo;", "\u{00AB}"),
    ("&raquo;", "\u{00BB}"),
    ("&hellip;", "\u{2026}"),
    ("&bull;", "\u{2022}"),
    ("&lsquo;", "\u{2018}"),
    ("&rsquo;", "\u{2019}"),
    ("&ldquo;", "\u{201C}"),
    ("&rdquo;", "\u{201D}"),
    ("&copy;", "\u{00A9}"),
    ("&reg;", "\u{00AE}"),
    ("&trade;", "\u{2122}"),
    ("&sect;", "\u{00A7}"),
    ("&para;", "\u{00B6}"),
    ("&deg;", "\u{00B0}"),
    ("&times;", "\u{00D7}"),
    ("&divide;", "\u{00F7}"),
    ("&frac12;", "\u{00BD}"),
    ("&frac14;", "\u{00BC}"),
    ("&frac34;", "\u{00BE}"),
    ("&plusmn;", "\u{00B1}"),
    ("&micro;", "\u{00B5}"),
    // European / Scandinavian letters
    ("&aelig;", "\u{00E6}"),
    ("&AElig;", "\u{00C6}"),
    ("&oslash;", "\u{00F8}"),
    ("&Oslash;", "\u{00D8}"),
    ("&aring;", "\u{00E5}"),
    ("&Aring;", "\u{00C5}"),
    ("&auml;", "\u{00E4}"),
    ("&Auml;", "\u{00C4}"),
    ("&ouml;", "\u{00F6}"),
    ("&Ouml;", "\u{00D6}"),
    ("&uuml;", "\u{00FC}"),
    ("&Uuml;", "\u{00DC}"),
    ("&szlig;", "\u{00DF}"),
    ("&ntilde;", "\u{00F1}"),
    ("&Ntilde;", "\u{00D1}"),
    ("&ccedil;", "\u{00E7}"),
    ("&Ccedil;", "\u{00C7}"),
    ("&eacute;", "\u{00E9}"),
    ("&Eacute;", "\u{00C9}"),
    ("&egrave;", "\u{00E8}"),
    ("&Egrave;", "\u{00C8}"),
    ("&ecirc;", "\u{00EA}"),
    ("&Ecirc;", "\u{00CA}"),
    ("&agrave;", "\u{00E0}"),
    ("&Agrave;", "\u{00C0}"),
    ("&aacute;", "\u{00E1}"),
    ("&Aacute;", "\u{00C1}"),
    ("&acirc;", "\u{00E2}"),
    ("&Acirc;", "\u{00C2}"),
    ("&iacute;", "\u{00ED}"),
    ("&Iacute;", "\u{00CD}"),
    ("&igrave;", "\u{00EC}"),
    ("&Igrave;", "\u{00CC}"),
    ("&ocirc;", "\u{00F4}"),
    ("&Ocirc;", "\u{00D4}"),
    ("&oacute;", "\u{00F3}"),
    ("&Oacute;", "\u{00D3}"),
    ("&ograve;", "\u{00F2}"),
    ("&Ograve;", "\u{00D2}"),
    ("&uacute;", "\u{00FA}"),
    ("&Uacute;", "\u{00DA}"),
    ("&ugrave;", "\u{00F9}"),
    ("&Ugrave;", "\u{00D9}"),
];

/// Decode common HTML entities and numeric character references.
///
/// The input is scanned once from left to right and every reference is
/// decoded exactly once; decoded output is never rescanned. That is what
/// keeps escaped markup escaped: `&amp;lt;` yields `&lt;`, and `&#38;amp;`
/// yields `&amp;` rather than `&`.
///
/// Recognised forms:
/// * the named references listed in `NAMED_ENTITIES` (case-sensitive,
///   terminating `;` required);
/// * decimal references such as `&#169;`;
/// * hexadecimal references such as `&#xA9;` or `&#XA9;`.
///
/// A numeric reference that is well formed but does not denote a Unicode
/// scalar value (a surrogate, a value above `0x10FFFF`, or a number too large
/// for `u32`) is removed from the output. Anything else that starts with `&`
/// but is not a recognised reference is copied through unchanged.
fn decode_entities(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(pos) = rest.find('&') {
        out.push_str(&rest[..pos]);
        let candidate = &rest[pos..];
        let consumed = match decode_reference(candidate, &mut out) {
            Some(len) => len,
            None => {
                // Not a reference: keep the ampersand and move past it.
                out.push('&');
                1
            }
        };
        rest = &candidate[consumed..];
    }

    out.push_str(rest);
    out
}

/// Try to decode the character reference at the start of `candidate`, which
/// must begin with `&`. On success the decoded text is appended to `out` and
/// the number of input bytes the reference occupied is returned; `None` means
/// `candidate` does not start with a recognised reference and `out` is left
/// untouched.
fn decode_reference(candidate: &str, out: &mut String) -> Option<usize> {
    if let Some(numeric) = candidate.strip_prefix("&#") {
        let (digits, radix) = match numeric.strip_prefix(|c: char| c == 'x' || c == 'X') {
            Some(hex) => (hex, 16),
            None => (numeric, 10),
        };
        // Only ASCII digits count, so `len` is always a char boundary.
        let len = digits
            .bytes()
            .take_while(|byte| char::from(*byte).is_digit(radix))
            .count();
        if len == 0 || digits.as_bytes().get(len) != Some(&b';') {
            return None;
        }
        // Well-formed but invalid code points are dropped, not kept verbatim.
        if let Some(ch) = u32::from_str_radix(&digits[..len], radix)
            .ok()
            .and_then(char::from_u32)
        {
            out.push(ch);
        }
        // "&#" plus optional "x", then the digits, then ";".
        return Some(candidate.len() - digits.len() + len + 1);
    }

    for &(entity, replacement) in NAMED_ENTITIES {
        if candidate.starts_with(entity) {
            out.push_str(replacement);
            return Some(entity.len());
        }
    }
    None
}
210
#[cfg(test)]
mod tests {
    use super::*;

    /// Everything inside `<head>` is discarded; only body text remains.
    #[test]
    fn test_strip_head() {
        let text = html_to_text_impl(
            "<html><head><title>T</title><style>body{}</style></head><body>Hello</body></html>",
        );
        assert_eq!(text, "Hello");
    }

    /// Heading levels map to the matching number of `#` characters.
    #[test]
    fn test_headings() {
        let text = html_to_text_impl("<h1>Title</h1><h2>Sub</h2><p>Text</p>");
        for expected in ["# Title", "## Sub", "Text"] {
            assert!(text.contains(expected));
        }
    }

    /// Each `<li>` becomes a `- ` bullet.
    #[test]
    fn test_list_items() {
        let text = html_to_text_impl("<ul><li>Alpha</li><li>Beta</li></ul>");
        for expected in ["- Alpha", "- Beta"] {
            assert!(text.contains(expected));
        }
    }

    /// Both `<strong>` and `<b>` are rendered as markdown bold.
    #[test]
    fn test_bold() {
        let text = html_to_text_impl("<p>Hello <strong>world</strong> and <b>rust</b></p>");
        for expected in ["**world**", "**rust**"] {
            assert!(text.contains(expected));
        }
    }

    /// Images with alt text leave a placeholder; images without one vanish.
    #[test]
    fn test_images() {
        let text = html_to_text_impl(r#"<img alt="logo" src="logo.png"><img src="spacer.gif">"#);
        assert!(text.contains("[image: logo]"));
        assert!(!text.contains("spacer"));
    }

    /// The text of every table cell survives the conversion.
    #[test]
    fn test_tables() {
        let text = html_to_text_impl(
            "<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>",
        );
        for expected in ["Name", "Age", "Alice", "30"] {
            assert!(text.contains(expected));
        }
    }

    /// Named, decimal and hexadecimal references are all decoded.
    #[test]
    fn test_entities() {
        let text =
            html_to_text_impl("<p>&lt;tag&gt; &amp; &quot;quotes&quot; &#169; &#x00A7;</p>");
        assert!(text.contains("<tag>"));
        assert!(text.contains("& \"quotes\""));
        assert!(text.contains("\u{00A9}")); // ©
        assert!(text.contains("\u{00A7}")); // §
    }

    /// An escaped entity is unescaped one level only.
    #[test]
    fn test_double_encoded_entities() {
        let text = html_to_text_impl("<p>&amp;lt; should stay as &amp;lt;</p>");
        assert!(text.contains("&lt;"));
    }

    /// Script and style bodies never leak into the output.
    #[test]
    fn test_script_style_removed() {
        let text = html_to_text_impl(
            "<p>Before</p><script>alert('xss')</script><style>.a{color:red}</style><p>After</p>",
        );
        for expected in ["Before", "After"] {
            assert!(text.contains(expected));
        }
        for forbidden in ["alert", "color"] {
            assert!(!text.contains(forbidden));
        }
    }

    /// Comment contents are dropped while the surrounding text is kept.
    #[test]
    fn test_comments_removed() {
        let text = html_to_text_impl("<p>A<!-- hidden -->B</p>");
        assert!(text.contains("AB") || text.contains("A B"));
        assert!(!text.contains("hidden"));
    }

    /// Anchors keep their label but not their target URL.
    #[test]
    fn test_links_stripped() {
        let text = html_to_text_impl(r#"<a href="https://example.com">click here</a>"#);
        assert!(text.contains("click here"));
        assert!(!text.contains("https://"));
    }

    /// Runs of spaces and of blank lines are both collapsed.
    #[test]
    fn test_whitespace_collapsed() {
        let text = html_to_text_impl("<p>  lots   of   spaces  </p>\n\n\n\n<p>after gap</p>");
        assert!(!text.contains("   "));
        // No more than 2 consecutive newlines
        assert!(!text.contains("\n\n\n"));
    }

    /// Every spelling of `<br>` produces a line break.
    #[test]
    fn test_br_tags() {
        let text = html_to_text_impl("line1<br>line2<br/>line3<br />line4");
        assert!(text.contains("line1\nline2"));
        for expected in ["line3", "line4"] {
            assert!(text.contains(expected));
        }
    }
}