use scraper::Html;
/// Element names that are discarded together with their entire subtree
/// when the document is re-serialized.
const REMOVE_TAGS: &[&str] = &[
    // Scripting, styling and document metadata.
    "script",
    "style",
    "head",
    "noscript",
    // Form controls.
    "input",
    "button",
    "select",
    "textarea",
    // Embedded content.
    "embed",
    "object",
    "applet",
    "iframe",
    "layer",
    "param",
];

/// Element names whose own tag is dropped while their children are kept
/// in place.
const REMOVE_TAG_KEEP_CHILDREN: &[&str] = &[
    "form",
];
/// Cleans `html` and returns the cleaned markup parsed as a document.
///
/// The input is first rewritten by [`remove_tags_and_comments`]; the
/// resulting string is then parsed again so callers receive a regular
/// [`Html`] document.
pub fn preprocess(html: &str) -> Html {
    let sanitized = remove_tags_and_comments(html);
    Html::parse_document(sanitized.as_str())
}
/// Parses `html` and serializes it back to a string via [`serialize_node`],
/// which is where unwanted elements and comments are left out.
///
/// The output buffer is pre-sized to the input length.
fn remove_tags_and_comments(html: &str) -> String {
    let parsed = Html::parse_document(html);
    let root = parsed.tree.root();

    let mut buffer = String::with_capacity(html.len());
    serialize_node(&root, &mut buffer);
    buffer
}
/// Recursively appends the HTML serialization of `node` to `out`.
///
/// * Document and fragment nodes contribute only their children.
/// * Elements named in `REMOVE_TAGS` are skipped along with their subtree.
/// * Elements named in `REMOVE_TAG_KEEP_CHILDREN` contribute only their
///   children (the wrapper tag itself is dropped).
/// * Every other element is written with all of its attributes; void
///   elements are written as `<tag ... />` without children or end tag.
/// * Text is written with `&`, `<` and `>` replaced by character references.
/// * Comments, processing instructions and doctypes are omitted.
fn serialize_node(node: &ego_tree::NodeRef<scraper::node::Node>, out: &mut String) {
    use scraper::node::Node;

    match node.value() {
        // Containers have no markup of their own.
        Node::Document | Node::Fragment => {
            for child in node.children() {
                serialize_node(&child, out);
            }
        }
        Node::Element(el) => {
            let tag = el.name();

            if REMOVE_TAGS.contains(&tag) {
                return;
            }

            if REMOVE_TAG_KEEP_CHILDREN.contains(&tag) {
                for child in node.children() {
                    serialize_node(&child, out);
                }
                return;
            }

            out.push('<');
            out.push_str(tag);
            for (attr, val) in el.attrs() {
                out.push(' ');
                out.push_str(attr);
                out.push_str("=\"");
                escape_attr(val, out);
                out.push('"');
            }

            if is_void_element(tag) {
                out.push_str(" />");
            } else {
                out.push('>');
                for child in node.children() {
                    serialize_node(&child, out);
                }
                out.push_str("</");
                out.push_str(tag);
                out.push('>');
            }
        }
        Node::Text(text) => {
            // The parser has already decoded character references, so the
            // text may hold literal markup characters. They must be
            // re-escaped, otherwise the second parse would turn text such
            // as "<year>" into an element.
            for ch in text.text.chars() {
                match ch {
                    '&' => out.push_str("&amp;"),
                    '<' => out.push_str("&lt;"),
                    '>' => out.push_str("&gt;"),
                    _ => out.push(ch),
                }
            }
        }
        Node::Comment(_) | Node::ProcessingInstruction(_) | Node::Doctype(_) => {}
    }
}
/// Appends `val` to `out`, escaped for use inside a double-quoted HTML
/// attribute value.
///
/// `&`, `<`, `>` and `"` are replaced by `&amp;`, `&lt;`, `&gt;` and
/// `&quot;`; every other character is copied unchanged. Escaping `"` keeps
/// the value from terminating the surrounding quotes, and escaping `&`
/// keeps an already-decoded ampersand from being read as the start of a
/// character reference when the output is parsed again.
fn escape_attr(val: &str, out: &mut String) {
    for ch in val.chars() {
        match ch {
            '&' => out.push_str("&amp;"),
            '<' => out.push_str("&lt;"),
            '>' => out.push_str("&gt;"),
            '"' => out.push_str("&quot;"),
            _ => out.push(ch),
        }
    }
}
/// Reports whether `tag` names an element that is serialized without
/// children and without an end tag.
///
/// The comparison is an exact, case-sensitive string match against a fixed
/// list of names.
fn is_void_element(tag: &str) -> bool {
    const VOID_ELEMENTS: [&str; 14] = [
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];

    VOID_ELEMENTS.iter().any(|&name| name == tag)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Joins the contents of every text node stored in `doc`'s tree.
    fn text_content(doc: &Html) -> String {
        doc.tree
            .nodes()
            .filter_map(|n| {
                if let scraper::node::Node::Text(t) = n.value() {
                    Some(t.text.as_ref())
                } else {
                    None
                }
            })
            .collect::<Vec<_>>()
            .join("")
    }

    /// Returns `true` when at least one element in `doc` matches the CSS
    /// selector `tag`. Panics if `tag` is not a valid selector.
    fn has_tag(doc: &Html, tag: &str) -> bool {
        let sel = scraper::Selector::parse(tag).unwrap();
        doc.select(&sel).next().is_some()
    }

    #[test]
    fn test_remove_head_tag() {
        let html = "<html><head><title>Title</title></head><body><p>text</p></body></html>";
        let doc = preprocess(html);
        assert!(
            !has_tag(&doc, "title"),
            "<title> should be removed with <head>"
        );
        assert!(has_tag(&doc, "p"));
    }

    #[test]
    fn test_remove_script() {
        let html = "<html><body><script>alert('x')</script><p>text</p></body></html>";
        let doc = preprocess(html);
        assert!(!has_tag(&doc, "script"));
        assert!(has_tag(&doc, "p"));
    }

    #[test]
    fn test_remove_style() {
        let html = "<html><body><style>body{color:red}</style><p>text</p></body></html>";
        let doc = preprocess(html);
        assert!(!has_tag(&doc, "style"));
        assert!(has_tag(&doc, "p"));
    }

    #[test]
    fn test_remove_form_family() {
        let html = "<html><body><form><input type=\"text\" /><button>Go</button></form><p>text</p></body></html>";
        let doc = preprocess(html);
        assert!(!has_tag(&doc, "form"));
        assert!(!has_tag(&doc, "input"));
        assert!(!has_tag(&doc, "button"));
        assert!(has_tag(&doc, "p"));
    }

    #[test]
    fn test_form_wrapper_text_preserved() {
        let html = "<html><body><form id=\"main\"><p>Article content</p></form></body></html>";
        let doc = preprocess(html);
        assert!(!has_tag(&doc, "form"), "form tag should be removed");
        assert!(has_tag(&doc, "p"), "child <p> should survive");
        let content = text_content(&doc);
        assert!(
            content.contains("Article content"),
            "text inside form should be preserved"
        );
    }

    #[test]
    fn test_remove_comments() {
        let html = "<html><body><!-- a comment --><p>text</p></body></html>";
        let doc = preprocess(html);
        let has_comment = doc
            .tree
            .nodes()
            .any(|n| matches!(n.value(), scraper::node::Node::Comment(_)));
        assert!(!has_comment);
        assert!(has_tag(&doc, "p"));
    }

    #[test]
    fn test_remove_embedded_layer() {
        let html = "<html><body><layer>plugin</layer><p>text</p></body></html>";
        let doc = preprocess(html);
        assert!(!has_tag(&doc, "layer"));
        assert!(has_tag(&doc, "p"));
    }

    #[test]
    fn test_remove_embedded_param() {
        let html = "<html><body><object><param name=\"src\" value=\"x\" /></object><p>text</p></body></html>";
        let doc = preprocess(html);
        assert!(!has_tag(&doc, "param"));
        assert!(!has_tag(&doc, "object"));
    }

    #[test]
    fn test_preserve_content() {
        let html = "<html><body><p>Hello <em>world</em></p></body></html>";
        let doc = preprocess(html);
        assert!(has_tag(&doc, "p"));
        assert!(has_tag(&doc, "em"));
        let content = text_content(&doc);
        assert!(content.contains("Hello"));
        assert!(content.contains("world"));
    }

    #[test]
    fn test_attribute_ampersand_survives_double_parse() {
        // The fixture carries an encoded ampersand; the first parse decodes
        // it, so serialization has to re-encode it for the second parse.
        let html = r#"<html><body><a href="/?a=1&amp;b=2">link</a></body></html>"#;
        let doc = preprocess(html);
        let sel = scraper::Selector::parse("a").unwrap();
        let href = doc
            .select(&sel)
            .next()
            .unwrap()
            .value()
            .attr("href")
            .unwrap();
        assert_eq!(href, "/?a=1&b=2", "decoded & must survive the double parse");
    }

    #[test]
    fn test_attribute_quote_survives_double_parse() {
        // A decoded double quote must not terminate the re-serialized
        // attribute value early.
        let html = r#"<html><body><a title="say &quot;hi&quot;">link</a></body></html>"#;
        let doc = preprocess(html);
        let sel = scraper::Selector::parse("a").unwrap();
        let title = doc
            .select(&sel)
            .next()
            .unwrap()
            .value()
            .attr("title")
            .unwrap();
        assert_eq!(title, "say \"hi\"", "decoded quote must survive the double parse");
    }

    #[test]
    fn test_text_entities_not_reparsed_as_tags() {
        // The angle brackets are entity-encoded in the input, so they are
        // text, not markup; they must still be text after preprocessing.
        let html = "<html><body><p>Use &lt;year&gt; as placeholder</p></body></html>";
        let doc = preprocess(html);
        let content = text_content(&doc);
        assert!(
            content.contains("<year>"),
            "decoded entity text should be preserved as text"
        );
        assert!(
            !has_tag(&doc, "year"),
            "<year> must not become a DOM element"
        );
    }
}