use std::sync::LazyLock;
use regex::Regex;
use unicode_normalization::UnicodeNormalization;
// Patterns shared by the HTML clean-up helpers in this file. Each one is compiled
// on first use; `unwrap()` would panic there only if the literal pattern were invalid.

/// A whole `<script ...> ... </script>` element. `(?is)` makes the match
/// case-insensitive and lets `.` cross newlines; `.*?` stops at the first closing tag.
static SCRIPT_BLOCK: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"(?is)<script[^>]*>.*?</script>").unwrap());
/// A whole `<style ...> ... </style>` element, matched the same way as `SCRIPT_BLOCK`.
static STYLE_BLOCK: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"(?is)<style[^>]*>.*?</style>").unwrap());
/// Leading whitespace, a name starting with `on` (any case), `=`, and a single- or
/// double-quoted value, e.g. ` onclick="go()"`.
// NOTE(review): unquoted values (`onclick=go()`) are not matched, and the pattern is
// not restricted to the inside of a tag, so text such as ` online="x"` also matches.
static INLINE_HANDLER: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r#"(?is)\s+on\w+\s*=\s*("[^"]*"|'[^']*')"#).unwrap());
/// An HTML comment from `<!--` to the first following `-->`, possibly spanning lines.
static HTML_COMMENT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?s)<!--.*?-->").unwrap());
/// Anything from `<` to the next `>` with at least one character in between.
static HTML_TAG: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<[^>]+>").unwrap());
/// A run of three or more consecutive `\n` characters.
static BLANK_LINES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n{3,}").unwrap());
/// A run of two or more spaces and/or tabs.
static INLINE_WS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[ \t]{2,}").unwrap());
/// A semicolon-terminated character reference: named with ASCII letters (`&amp;`),
/// decimal (`&#169;`) or hexadecimal with a lowercase `x` (`&#xa9;`).
static HTML_ENTITY: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"&[a-zA-Z]+;|&#[0-9]+;|&#x[0-9a-fA-F]+;").unwrap());
/// Removes non-content markup from `html` and returns what is left.
///
/// Four passes run in a fixed order, each on the output of the previous one:
/// `SCRIPT_BLOCK`, `STYLE_BLOCK`, `INLINE_HANDLER`, then `HTML_COMMENT`. Every
/// match is replaced with the empty string; all other markup is left in place.
pub fn strip_noise(html: &str) -> String {
    let without_scripts = SCRIPT_BLOCK.replace_all(html, "");
    let without_styles = STYLE_BLOCK.replace_all(&without_scripts, "");
    let without_handlers = INLINE_HANDLER.replace_all(&without_styles, "");
    HTML_COMMENT.replace_all(&without_handlers, "").into_owned()
}
/// Trims `text` and squeezes every run of three or more newlines down to two.
///
/// Leading and trailing whitespace is removed first, so the result never starts
/// or ends with blank lines. Runs of one or two newlines are left untouched.
pub fn collapse_blank_lines(text: &str) -> String {
    let trimmed = text.trim();
    BLANK_LINES.replace_all(trimmed, "\n\n").into_owned()
}
pub fn decode_html_entities(text: &str) -> String {
HTML_ENTITY
.replace_all(text, |caps: ®ex::Captures| {
match caps.get(0).unwrap().as_str() {
" " => " ",
"&" => "&",
"<" => "<",
">" => ">",
""" => "\"",
"'" => "'",
_ => " ",
}
.to_string()
})
.to_string()
}
/// Converts an HTML fragment into plain text, one non-empty line per output line.
///
/// Steps, in order:
/// 1. `strip_noise` removes scripts, styles, inline handlers and comments.
/// 2. Every remaining `HTML_TAG` match is replaced with a single space.
/// 3. Character references are decoded by `decode_html_entities`. This runs after
///    tag removal, so an escaped `&lt;b&gt;` survives as literal text.
/// 4. The text is normalized to Unicode NFC.
/// 5. Each line is trimmed, runs of spaces/tabs are collapsed to one space, and
///    lines that end up empty are dropped. The rest are joined with `\n`.
pub fn to_plain_text(html: &str) -> String {
    let cleaned = strip_noise(html);
    let without_tags = HTML_TAG.replace_all(&cleaned, " ");
    let decoded = decode_html_entities(&without_tags);
    let composed: String = decoded.nfc().collect();

    let mut plain = String::with_capacity(composed.len());
    for raw_line in composed.lines() {
        let squeezed = INLINE_WS.replace_all(raw_line.trim(), " ");
        let content = squeezed.trim();
        if content.is_empty() {
            continue;
        }
        // Only non-empty lines are ever appended, so a non-empty buffer means at
        // least one line precedes this one and a separator is needed.
        if !plain.is_empty() {
            plain.push('\n');
        }
        plain.push_str(content);
    }
    plain
}