use regex::Regex;
/// Converts an HTML document or fragment into plain text.
///
/// This is the public entry point; it forwards `html` to
/// [`html_to_text_impl`] and returns that function's result unchanged.
pub fn html_to_text(html: &str) -> String {
    html_to_text_impl(html)
}
pub fn html_to_text_impl(html: &str) -> String {
let mut text = html.to_string();
let re_head = Regex::new(r"(?is)<head[\s>].*?</head>").unwrap();
text = re_head.replace_all(&text, "").to_string();
let re_script = Regex::new(r"(?is)<script[\s>].*?</script>").unwrap();
text = re_script.replace_all(&text, "").to_string();
let re_style = Regex::new(r"(?is)<style[\s>].*?</style>").unwrap();
text = re_style.replace_all(&text, "").to_string();
let re_comment = Regex::new(r"(?s)<!--.*?-->").unwrap();
text = re_comment.replace_all(&text, "").to_string();
for level in 1..=6usize {
let pattern = format!(r"(?is)<h{0}\b[^>]*>(.*?)</h{0}\s*>", level);
let re = Regex::new(&pattern).unwrap();
let prefix = "#".repeat(level);
text = re
.replace_all(&text, |caps: ®ex::Captures| {
format!("\n{} {}\n", prefix, &caps[1])
})
.to_string();
}
let re_li = Regex::new(r"(?i)<li\b[^>]*>").unwrap();
text = re_li.replace_all(&text, "\n- ").to_string();
let re_b = Regex::new(r"(?is)<b\b[^>]*>(.*?)</b\s*>").unwrap();
text = re_b.replace_all(&text, "**$1**").to_string();
let re_strong = Regex::new(r"(?is)<strong\b[^>]*>(.*?)</strong\s*>").unwrap();
text = re_strong.replace_all(&text, "**$1**").to_string();
let re_img_alt = Regex::new(r#"(?i)<img\b[^>]*\balt=["']([^"']*)["'][^>]*/?\s*>"#).unwrap();
text = re_img_alt.replace_all(&text, "[image: $1]").to_string();
let re_img_no_alt = Regex::new(r"(?i)<img\b[^>]*/?\s*>").unwrap();
text = re_img_no_alt.replace_all(&text, "").to_string();
let re_cell = Regex::new(r"(?i)<(td|th)\b[^>]*>").unwrap();
text = re_cell.replace_all(&text, "\t").to_string();
let re_br = Regex::new(r"(?i)<br\b[^>]*/?\s*>").unwrap();
text = re_br.replace_all(&text, "\n").to_string();
let re_block_open = Regex::new(
r"(?i)<(p|div|section|article|table|tr|ul|ol|blockquote|header|footer|nav|main|aside|figure|figcaption|details|summary|dl|dt|dd|pre|address)\b[^>]*>",
)
.unwrap();
text = re_block_open.replace_all(&text, "\n").to_string();
let re_block_close = Regex::new(
r"(?i)</(p|div|section|article|table|tr|ul|ol|blockquote|header|footer|nav|main|aside|figure|figcaption|details|summary|dl|dt|dd|pre|address|h[1-6])>",
)
.unwrap();
text = re_block_close.replace_all(&text, "\n").to_string();
let re_tags = Regex::new(r"<[^>]+>").unwrap();
text = re_tags.replace_all(&text, "").to_string();
text = decode_entities(&text);
let re_hspace = Regex::new(r"[^\S\n]+").unwrap();
text = re_hspace.replace_all(&text, " ").to_string();
let re_blanks = Regex::new(r"\n{3,}").unwrap();
text = re_blanks.replace_all(&text, "\n\n").to_string();
let trimmed: Vec<&str> = text.lines().map(|l| l.trim()).collect();
text = trimmed.join("\n");
text.trim().to_string()
}
/// Decodes HTML character references in `text`.
///
/// A reference is an ampersand, a name, and a terminating semicolon. Three
/// kinds of name are understood:
///
/// * the named references listed in [`named_entity`] (matched case-sensitively),
/// * decimal numeric references (`#` followed by ASCII digits),
/// * hexadecimal numeric references (`#x` or `#X` followed by hex digits).
///
/// The input is scanned once from left to right and decoded output is never
/// scanned again, so a double-escaped reference loses exactly one level of
/// escaping regardless of how the ampersand itself was written.
///
/// Anything that is not a complete, known reference (a bare ampersand, an
/// unknown name, a missing semicolon) is copied through unchanged. A numeric
/// reference whose value is not a valid Unicode scalar value (a surrogate, or
/// a number that is out of range) is removed from the output.
fn decode_entities(text: &str) -> String {
    let mut decoded = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(amp) = rest.find('&') {
        decoded.push_str(&rest[..amp]);
        let after_amp = &rest[amp + 1..];

        // A reference name consists of ASCII letters, digits and `#` only, so
        // the look-ahead stops at the first other byte. This keeps the whole
        // scan linear and guarantees the slice below ends on a char boundary.
        let name_len = after_amp
            .bytes()
            .take_while(|b| b.is_ascii_alphanumeric() || *b == b'#')
            .count();
        let terminated = after_amp.as_bytes().get(name_len) == Some(&b';');
        let resolved = if terminated {
            resolve_reference(&after_amp[..name_len])
        } else {
            None
        };

        match resolved {
            Some(replacement) => {
                // `None` here means a numeric reference to an invalid code
                // point: the reference is consumed but nothing is emitted.
                if let Some(ch) = replacement {
                    decoded.push(ch);
                }
                rest = &after_amp[name_len + 1..];
            }
            None => {
                // Not a reference: keep the ampersand and carry on right after it.
                decoded.push('&');
                rest = after_amp;
            }
        }
    }

    decoded.push_str(rest);
    decoded
}

/// Resolves the text between the ampersand and the semicolon of a reference.
///
/// Returns `None` when `name` is not a recognised reference, `Some(None)` when
/// it is a numeric reference to an invalid code point, and `Some(Some(ch))`
/// otherwise.
fn resolve_reference(name: &str) -> Option<Option<char>> {
    match name.strip_prefix('#') {
        Some(number) => numeric_reference(number),
        None => named_entity(name).map(Some),
    }
}

/// Parses the part of a numeric reference that follows the `#`.
///
/// An `x`/`X` prefix selects hexadecimal, otherwise the digits are decimal.
/// Returns `None` when there are no digits or a character is not a digit in
/// the selected radix; otherwise returns the decoded character, or
/// `Some(None)` when the number overflows or is not a Unicode scalar value.
fn numeric_reference(number: &str) -> Option<Option<char>> {
    let (digits, radix) = match number.strip_prefix(|c: char| c == 'x' || c == 'X') {
        Some(hex_digits) => (hex_digits, 16),
        None => (number, 10),
    };
    if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
        return None;
    }
    Some(
        u32::from_str_radix(digits, radix)
            .ok()
            .and_then(char::from_u32),
    )
}

/// Maps the name of a supported named character reference (without the
/// leading ampersand and trailing semicolon) to the character it stands for.
///
/// `nbsp` is mapped to an ordinary space rather than U+00A0.
fn named_entity(name: &str) -> Option<char> {
    let ch = match name {
        // Markup-significant characters.
        "amp" => '&',
        "lt" => '<',
        "gt" => '>',
        "quot" => '"',
        "apos" => '\'',
        "nbsp" => ' ',
        // Punctuation and typographic quotes.
        "mdash" => '\u{2014}',
        "ndash" => '\u{2013}',
        "laquo" => '\u{00AB}',
        "raquo" => '\u{00BB}',
        "hellip" => '\u{2026}',
        "bull" => '\u{2022}',
        "lsquo" => '\u{2018}',
        "rsquo" => '\u{2019}',
        "ldquo" => '\u{201C}',
        "rdquo" => '\u{201D}',
        // Symbols.
        "copy" => '\u{00A9}',
        "reg" => '\u{00AE}',
        "trade" => '\u{2122}',
        "sect" => '\u{00A7}',
        "para" => '\u{00B6}',
        "deg" => '\u{00B0}',
        "times" => '\u{00D7}',
        "divide" => '\u{00F7}',
        "frac12" => '\u{00BD}',
        "frac14" => '\u{00BC}',
        "frac34" => '\u{00BE}',
        "plusmn" => '\u{00B1}',
        "micro" => '\u{00B5}',
        // Accented and special Latin letters.
        "aelig" => '\u{00E6}',
        "AElig" => '\u{00C6}',
        "oslash" => '\u{00F8}',
        "Oslash" => '\u{00D8}',
        "aring" => '\u{00E5}',
        "Aring" => '\u{00C5}',
        "auml" => '\u{00E4}',
        "Auml" => '\u{00C4}',
        "ouml" => '\u{00F6}',
        "Ouml" => '\u{00D6}',
        "uuml" => '\u{00FC}',
        "Uuml" => '\u{00DC}',
        "szlig" => '\u{00DF}',
        "ntilde" => '\u{00F1}',
        "Ntilde" => '\u{00D1}',
        "ccedil" => '\u{00E7}',
        "Ccedil" => '\u{00C7}',
        "eacute" => '\u{00E9}',
        "Eacute" => '\u{00C9}',
        "egrave" => '\u{00E8}',
        "Egrave" => '\u{00C8}',
        "ecirc" => '\u{00EA}',
        "Ecirc" => '\u{00CA}',
        "agrave" => '\u{00E0}',
        "Agrave" => '\u{00C0}',
        "aacute" => '\u{00E1}',
        "Aacute" => '\u{00C1}',
        "acirc" => '\u{00E2}',
        "Acirc" => '\u{00C2}',
        "iacute" => '\u{00ED}',
        "Iacute" => '\u{00CD}',
        "igrave" => '\u{00EC}',
        "Igrave" => '\u{00CC}',
        "ocirc" => '\u{00F4}',
        "Ocirc" => '\u{00D4}',
        "oacute" => '\u{00F3}',
        "Oacute" => '\u{00D3}',
        "ograve" => '\u{00F2}',
        "Ograve" => '\u{00D2}',
        "uacute" => '\u{00FA}',
        "Uacute" => '\u{00DA}',
        "ugrave" => '\u{00F9}',
        "Ugrave" => '\u{00D9}',
        _ => return None,
    };
    Some(ch)
}
#[cfg(test)]
mod tests {
    // Unit tests for the HTML-to-text conversion.
    //
    // Ampersands that start a character reference inside a fixture are spelled
    // `\u{26}` so the reference stays intact even if this file passes through
    // tooling that decodes HTML entities in source text.
    use super::*;

    /// Everything inside `<head>` is dropped, including nested elements.
    #[test]
    fn test_strip_head() {
        let html =
            "<html><head><title>T</title><style>body{}</style></head><body>Hello</body></html>";
        let result = html_to_text_impl(html);
        assert_eq!(result, "Hello");
    }

    /// Headings are rendered with one `#` per level.
    #[test]
    fn test_headings() {
        let html = "<h1>Title</h1><h2>Sub</h2><p>Text</p>";
        let result = html_to_text_impl(html);
        assert!(result.contains("# Title"));
        assert!(result.contains("## Sub"));
        assert!(result.contains("Text"));
    }

    /// List items are rendered as `- ` bullet lines.
    #[test]
    fn test_list_items() {
        let html = "<ul><li>Alpha</li><li>Beta</li></ul>";
        let result = html_to_text_impl(html);
        assert!(result.contains("- Alpha"));
        assert!(result.contains("- Beta"));
    }

    /// Both `<strong>` and `<b>` are wrapped in `**`.
    #[test]
    fn test_bold() {
        let html = "<p>Hello <strong>world</strong> and <b>rust</b></p>";
        let result = html_to_text_impl(html);
        assert!(result.contains("**world**"));
        assert!(result.contains("**rust**"));
    }

    /// Images keep their alt text; images without alt text disappear.
    #[test]
    fn test_images() {
        let html = r#"<img alt="logo" src="logo.png"><img src="spacer.gif">"#;
        let result = html_to_text_impl(html);
        assert!(result.contains("[image: logo]"));
        assert!(!result.contains("spacer"));
    }

    /// Cell contents of header and data rows survive the conversion.
    #[test]
    fn test_tables() {
        let html =
            "<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>";
        let result = html_to_text_impl(html);
        assert!(result.contains("Name"));
        assert!(result.contains("Age"));
        assert!(result.contains("Alice"));
        assert!(result.contains("30"));
    }

    /// Named references are decoded, and escaped markup is kept as text
    /// instead of being stripped as a tag.
    #[test]
    fn test_entities() {
        let html = "<p>\u{26}lt;tag\u{26}gt; \u{26}amp; \u{26}quot;quotes\u{26}quot; \u{26}copy; \u{26}sect;</p>";
        let result = html_to_text_impl(html);
        assert!(result.contains("<tag>"));
        assert!(result.contains("& \"quotes\""));
        assert!(result.contains("\u{00A9}"));
        assert!(result.contains("\u{00A7}"));
    }

    /// A double-escaped reference loses only one level of escaping.
    #[test]
    fn test_double_encoded_entities() {
        let html = "<p>\u{26}amp;lt; should stay as \u{26}amp;lt;</p>";
        let result = html_to_text_impl(html);
        assert!(result.contains("\u{26}lt;"));
    }

    /// Script and style bodies never reach the output.
    #[test]
    fn test_script_style_removed() {
        let html =
            "<p>Before</p><script>alert('xss')</script><style>.a{color:red}</style><p>After</p>";
        let result = html_to_text_impl(html);
        assert!(result.contains("Before"));
        assert!(result.contains("After"));
        assert!(!result.contains("alert"));
        assert!(!result.contains("color"));
    }

    /// Comments are removed together with their content.
    #[test]
    fn test_comments_removed() {
        let html = "<p>A<!-- hidden -->B</p>";
        let result = html_to_text_impl(html);
        assert!(result.contains("AB") || result.contains("A B"));
        assert!(!result.contains("hidden"));
    }

    /// Anchor text is kept while the link target is discarded.
    #[test]
    fn test_links_stripped() {
        let html = r#"<a href="https://example.com">click here</a>"#;
        let result = html_to_text_impl(html);
        assert!(result.contains("click here"));
        assert!(!result.contains("https://"));
    }

    /// Runs of spaces collapse to one space and runs of blank lines to one
    /// blank line.
    #[test]
    fn test_whitespace_collapsed() {
        let html = "<p>  lots   of   spaces  </p>\n\n\n\n<p>after gap</p>";
        let result = html_to_text_impl(html);
        assert!(!result.contains("  "));
        assert!(!result.contains("\n\n\n"));
    }

    /// All spellings of the line-break tag produce a newline.
    #[test]
    fn test_br_tags() {
        let html = "line1<br>line2<br/>line3<br />line4";
        let result = html_to_text_impl(html);
        assert!(result.contains("line1\nline2"));
        assert!(result.contains("line3"));
        assert!(result.contains("line4"));
    }
}