use regex::Regex;
use std::sync::OnceLock;
/// Converts an HTML fragment into a single line of plain text.
///
/// Processing order:
/// 1. everything matched by `script_re` is replaced with a space;
/// 2. tags matched by `break_re` are replaced with a newline;
/// 3. every remaining tag matched by `tag_re` is replaced with a space;
/// 4. the result is passed through `decode_entities`;
/// 5. runs of whitespace are collapsed to one ASCII space and leading or
///    trailing whitespace is dropped.
///
/// Because step 5 treats newlines as ordinary whitespace, the line breaks
/// introduced in step 2 do not survive into the returned string.
pub fn plain_text_from_html(input: &str) -> String {
    let without_scripts = script_re().replace_all(input, " ");
    let with_newlines = break_re().replace_all(&without_scripts, "\n");
    let without_tags = tag_re().replace_all(&with_newlines, " ");
    let decoded = decode_entities(&without_tags);

    // Re-join the whitespace-separated words with exactly one space each.
    let mut text = String::with_capacity(decoded.len());
    for word in decoded.split_whitespace() {
        if !text.is_empty() {
            text.push(' ');
        }
        text.push_str(word);
    }
    text
}
/// Decodes a small, fixed set of HTML character references in `input`.
///
/// Recognised references, each written as `&name;`:
///
/// | name            | result            |
/// |-----------------|-------------------|
/// | `nbsp`          | ASCII space       |
/// | `amp`           | ampersand         |
/// | `lt`            | less-than sign    |
/// | `gt`            | greater-than sign |
/// | `quot`          | double quote      |
/// | `apos`, `#39`   | apostrophe        |
///
/// The input is scanned once from left to right, so the output of one
/// replacement is never decoded again: an escaped ampersand followed by
/// `lt;` yields the literal text `&lt;`, not a less-than sign. Anything
/// that is not one of the references above, including a bare ampersand
/// and unknown names, is copied through unchanged.
fn decode_entities(input: &str) -> String {
    // Longest reference name handled below ("nbsp", "quot", "apos"). Bounding
    // the search for ';' keeps the scan linear even for inputs that contain
    // many ampersands and no semicolons.
    const MAX_NAME_LEN: usize = 4;

    let mut out = String::with_capacity(input.len());
    let mut rest = input;

    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let tail = &rest[amp + 1..];

        // ';' is ASCII, so its byte offset is always a valid char boundary,
        // even when the bytes before it belong to multi-byte characters.
        let decoded = tail
            .bytes()
            .take(MAX_NAME_LEN + 1)
            .position(|b| b == b';')
            .and_then(|semi| {
                let ch = match &tail[..semi] {
                    "nbsp" => ' ',
                    "amp" => '&',
                    "lt" => '<',
                    "gt" => '>',
                    "quot" => '"',
                    "apos" | "#39" => '\'',
                    _ => return None,
                };
                Some((ch, semi))
            });

        if let Some((ch, semi)) = decoded {
            out.push(ch);
            rest = &tail[semi + 1..];
        } else {
            // Not a reference we know: keep the ampersand literally.
            out.push('&');
            rest = tail;
        }
    }

    out.push_str(rest);
    out
}
/// Returns the lazily compiled pattern that matches whole `<script>` and
/// `<style>` elements, including their contents.
///
/// Matching is case-insensitive and `.` spans newlines, so multi-line
/// blocks are matched up to the first corresponding closing tag.
///
/// The opening tag name must be followed by whitespace, `/` or `>`, so
/// unrelated tags that merely start with the same letters (for example
/// `<scripts>`) are not treated as script blocks. The closing tag may
/// carry whitespace before `>` (for example `</script >`), so the body of
/// such an element is still matched as part of the element.
fn script_re() -> &'static Regex {
    static RE: OnceLock<Regex> = OnceLock::new();
    RE.get_or_init(|| {
        Regex::new(
            r"(?is)<script(?:[\s/][^>]*)?>.*?</script\s*>|<style(?:[\s/][^>]*)?>.*?</style\s*>",
        )
        .expect("script/style pattern is a valid regex")
    })
}
/// Returns the lazily compiled pattern for tags that mark a line break.
///
/// It matches, case-insensitively, `<br>` (optionally self-closed) and the
/// closing tags `</p>`, `</div>`, `</li>` and `</tr>`, with optional
/// whitespace after `<` and around the optional trailing `/`. Tags that
/// carry attributes (for example `<br class="x">`) are not matched.
fn break_re() -> &'static Regex {
    const PATTERN: &str = r"(?i)<\s*(br|/p|/div|/li|/tr)\s*/?\s*>";
    static COMPILED: OnceLock<Regex> = OnceLock::new();

    fn compile() -> Regex {
        Regex::new(PATTERN).unwrap()
    }

    COMPILED.get_or_init(compile)
}
/// Returns the lazily compiled pattern that matches any remaining markup.
///
/// Two alternatives are tried in order:
/// 1. an HTML comment, from `<!--` up to the first `-->`, so comments whose
///    text contains `>` are matched as a whole instead of being cut at the
///    first `>`;
/// 2. any other tag: `<`, one or more characters other than `>`, then `>`.
///
/// Both alternatives may span multiple lines.
fn tag_re() -> &'static Regex {
    static RE: OnceLock<Regex> = OnceLock::new();
    RE.get_or_init(|| {
        Regex::new(r"(?s)<!--.*?-->|<[^>]+>").expect("tag pattern is a valid regex")
    })
}
#[cfg(test)]
mod tests {
    use super::plain_text_from_html;

    /// Nested tags are removed and the remaining words, including non-ASCII
    /// text, are joined by single spaces.
    #[test]
    fn strips_common_html() {
        let html = "<div>苹果 <span>apple</span></div>";
        let text = plain_text_from_html(html);
        assert_eq!(text, "苹果 apple");
    }
}