use crate::options::CleaningOptions;
use dom_query::Document;
use std::collections::HashSet;
/// Applies a configurable set of cleaning passes to a parsed HTML [`Document`].
///
/// Which passes run, and what they target, is determined by the
/// [`CleaningOptions`] the cleaner holds.
#[derive(Clone, Debug)]
pub struct HtmlCleaner {
    /// Settings consulted by the cleaning passes.
    options: CleaningOptions,
}
impl Default for HtmlCleaner {
fn default() -> Self {
Self::new()
}
}
impl HtmlCleaner {
    /// Creates a cleaner configured with [`CleaningOptions::default`].
    #[must_use]
    pub fn new() -> Self {
        Self::with_options(CleaningOptions::default())
    }

    /// Creates a cleaner that applies the given `options`.
    #[must_use]
    pub fn with_options(options: CleaningOptions) -> Self {
        Self { options }
    }

    /// Returns the options this cleaner was built with.
    #[must_use]
    pub fn options(&self) -> &CleaningOptions {
        &self.options
    }

    /// Runs every enabled cleaning pass over `doc`, mutating it in place.
    ///
    /// Passes run in a fixed order: tag removal, tag stripping, selector
    /// removal, comment removal, empty-element pruning, whitespace
    /// normalization and finally attribute stripping. Because attributes are
    /// stripped last, selector-based removal still sees the original
    /// attributes (classes, ids, ...).
    pub fn clean(&self, doc: &Document) {
        let opts = &self.options;

        if !opts.tags_to_remove.is_empty() {
            let tags: Vec<&str> = opts.tags_to_remove.iter().map(String::as_str).collect();
            self.remove_tags(doc, &tags);
        }
        if !opts.tags_to_strip.is_empty() {
            let tags: Vec<&str> = opts.tags_to_strip.iter().map(String::as_str).collect();
            self.strip_tags(doc, &tags);
        }
        for selector in &opts.selectors_to_remove {
            self.remove_by_selector(doc, selector);
        }
        if opts.remove_comments {
            self.remove_comments(doc);
        }
        if opts.prune_empty {
            self.prune_empty(doc);
        }
        if opts.normalize_whitespace {
            self.normalize_text(doc);
        }
        if opts.strip_attributes {
            self.clean_attributes(doc);
        }
    }

    /// Removes every comment node from `doc`.
    ///
    /// Comments that are direct children of the document node (for example a
    /// comment placed before `<html>`) are removed as well as comments nested
    /// inside elements.
    pub fn remove_comments(&self, doc: &Document) {
        // `*` only matches elements, so the document node has to be visited
        // explicitly to reach comments that sit outside the root element.
        let root = doc.root();
        let elements = doc.select("*");
        for parent in std::iter::once(&root).chain(elements.nodes()) {
            for child in parent.children() {
                if child.is_comment() {
                    child.remove_from_parent();
                }
            }
        }
    }

    /// Removes every element matching one of `tags` from `doc`.
    ///
    /// The names are joined into a single CSS selector list. An empty `tags`
    /// slice is a no-op, and so is a list that does not parse as a selector
    /// (instead of panicking).
    pub fn remove_tags(&self, doc: &Document, tags: &[&str]) {
        if tags.is_empty() {
            return;
        }
        let selector = tags.join(", ");
        // `try_select` yields `None` for an unparsable selector or when nothing
        // matches; `select` would panic on the former.
        if let Some(matched) = doc.try_select(&selector) {
            matched.remove();
        }
    }

    /// Strips the elements named in `tags` starting from the first element of
    /// `doc`, delegating to `Selection::strip_elements`.
    ///
    /// An empty `tags` slice, or a document without any element, is a no-op.
    pub fn strip_tags(&self, doc: &Document, tags: &[&str]) {
        if tags.is_empty() {
            return;
        }
        let root = doc.select("*").first();
        if root.exists() {
            root.strip_elements(tags);
        }
    }

    /// Removes every element matching the CSS `selector` from `doc`.
    ///
    /// A selector that fails to parse is ignored rather than causing a panic.
    pub fn remove_by_selector(&self, doc: &Document, selector: &str) {
        if let Some(matched) = doc.try_select(selector) {
            matched.remove();
        }
    }

    /// Repeatedly removes elements that have no element children and only
    /// whitespace text, until a full pass removes nothing.
    ///
    /// The candidate tag names come from `options.empty_tags`; when that list
    /// is empty, `div`, `span`, `p`, `section` and `article` are used. Entries
    /// that do not parse as a selector are skipped.
    pub fn prune_empty(&self, doc: &Document) {
        let empty_tags: Vec<&str> = if self.options.empty_tags.is_empty() {
            vec!["div", "span", "p", "section", "article"]
        } else {
            self.options.empty_tags.iter().map(String::as_str).collect()
        };

        // Removing a leaf can leave its parent empty, so keep sweeping until a
        // pass makes no change.
        loop {
            let mut removed = false;
            for tag in &empty_tags {
                if let Some(matched) = doc.try_select(tag) {
                    // Reverse order, as in the original sweep: later matches
                    // are handled before earlier ones.
                    for node in matched.nodes().iter().rev() {
                        let sel = dom_query::Selection::from(*node);
                        if sel.children().is_empty() && sel.text().trim().is_empty() {
                            sel.remove();
                            removed = true;
                        }
                    }
                }
            }
            if !removed {
                break;
            }
        }
    }

    /// Rewrites each text node that is a direct child of an element with the
    /// result of `crate::text::normalize`, leaving already-normalized nodes
    /// untouched.
    pub fn normalize_text(&self, doc: &Document) {
        for node in doc.select("*").nodes() {
            for child in node.children() {
                if !child.is_text() {
                    continue;
                }
                let text_str = child.text().to_string();
                let normalized = crate::text::normalize(&text_str);
                // Skip the write when nothing changed.
                if text_str != normalized {
                    child.set_text(normalized);
                }
            }
        }
    }

    /// Removes from every element all attributes whose local name is not
    /// listed in `options.preserved_attributes`.
    pub fn clean_attributes(&self, doc: &Document) {
        let preserved: HashSet<&str> = self
            .options
            .preserved_attributes
            .iter()
            .map(String::as_str)
            .collect();

        for node in doc.select("*").nodes() {
            // Snapshot the names to drop first so the attribute list is not
            // being read while it is modified.
            let unwanted: Vec<String> = node
                .attrs()
                .iter()
                .map(|a| a.name.local.to_string())
                .filter(|name| !preserved.contains(name.as_str()))
                .collect();
            if unwanted.is_empty() {
                continue;
            }
            let sel = dom_query::Selection::from(*node);
            for name in &unwanted {
                sel.remove_attr(name);
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed cleaner has no tags configured for removal.
    #[test]
    fn test_new_cleaner() {
        let cleaner = HtmlCleaner::new();
        assert!(cleaner.options().tags_to_remove.is_empty());
    }

    /// `remove_tags` drops the listed elements and leaves siblings alone.
    #[test]
    fn test_remove_tags() {
        let cleaner = HtmlCleaner::new();
        let doc = Document::from("<div><script>bad</script><p>good</p></div>");
        cleaner.remove_tags(&doc, &["script"]);
        assert!(doc.select("script").is_empty());
        assert!(doc.select("p").exists());
    }

    /// `remove_by_selector` drops elements matched by a CSS selector.
    #[test]
    fn test_remove_by_selector() {
        let cleaner = HtmlCleaner::new();
        let doc = Document::from(r#"<div class="ad">Ad</div><p>Content</p>"#);
        cleaner.remove_by_selector(&doc, ".ad");
        assert!(doc.select(".ad").is_empty());
        assert!(doc.select("p").exists());
    }

    /// `prune_empty` removes the empty `<p>` but keeps the one with content.
    #[test]
    fn test_prune_empty() {
        let options = CleaningOptions {
            prune_empty: true,
            ..Default::default()
        };
        let cleaner = HtmlCleaner::with_options(options);
        let doc = Document::from("<div><p></p><p>Content</p></div>");
        cleaner.prune_empty(&doc);
        assert_eq!(doc.select("p").length(), 1);
    }

    /// `clean_attributes` keeps preserved attributes and removes the rest.
    #[test]
    fn test_clean_attributes() {
        let options = CleaningOptions {
            strip_attributes: true,
            preserved_attributes: vec!["href".into()],
            ..Default::default()
        };
        let cleaner = HtmlCleaner::with_options(options);
        let doc = Document::from(r#"<a href="url" class="link" id="x">Link</a>"#);
        cleaner.clean_attributes(&doc);
        let a = doc.select("a");
        assert!(a.attr("href").is_some());
        assert!(a.attr("class").is_none());
        assert!(a.attr("id").is_none());
    }

    /// `strip_tags` removes the wrapper elements but keeps their text.
    #[test]
    fn test_strip_tags_preserves_text() {
        let cleaner = HtmlCleaner::new();
        let doc = Document::from("<div><span>Hello</span> <b>World</b></div>");
        cleaner.strip_tags(&doc, &["span", "b"]);
        assert!(doc.select("span").is_empty());
        assert!(doc.select("b").is_empty());
        let text = doc.select("div").text();
        assert!(text.contains("Hello"), "Text 'Hello' should be preserved");
        assert!(text.contains("World"), "Text 'World' should be preserved");
    }

    /// `remove_comments` deletes comment nodes and keeps element content.
    #[test]
    fn test_remove_comments() {
        let cleaner = HtmlCleaner::new();
        let doc =
            Document::from("<div><!-- This is a comment --><p>Content</p><!-- Another --></div>");
        cleaner.remove_comments(&doc);
        let html = doc.select("div").html().to_string();
        assert!(!html.contains("comment"), "Comments should be removed: {html}");
        assert!(html.contains("Content"), "Content should be preserved");
    }

    /// `clean` honours the `remove_comments` option.
    #[test]
    fn test_clean_with_comments_option() {
        let options = CleaningOptions {
            remove_comments: true,
            ..Default::default()
        };
        let cleaner = HtmlCleaner::with_options(options);
        let doc = Document::from("<div><!-- comment --><p>Text</p></div>");
        cleaner.clean(&doc);
        let html = doc.select("div").html().to_string();
        assert!(!html.contains("comment"));
    }

    /// `normalize_text` collapses runs of spaces inside text nodes.
    #[test]
    fn test_normalize_text() {
        let options = CleaningOptions {
            normalize_whitespace: true,
            ..Default::default()
        };
        let cleaner = HtmlCleaner::with_options(options);
        // The input must contain runs of spaces, and the check must look for a
        // run (two spaces): single spaces between words are expected to remain.
        let doc = Document::from("<p>  Multiple   spaces   here  </p>");
        cleaner.normalize_text(&doc);
        let text = doc.select("p").text();
        assert!(!text.contains("  "), "Multiple spaces should be collapsed");
    }
}