1use crate::options::CleaningOptions;
4use dom_query::Document;
5use std::collections::HashSet;
6
7#[derive(Debug, Clone)]
29pub struct HtmlCleaner {
30 options: CleaningOptions,
31}
32
33impl Default for HtmlCleaner {
34 fn default() -> Self {
35 Self::new()
36 }
37}
38
39impl HtmlCleaner {
40 #[must_use]
42 pub fn new() -> Self {
43 Self {
44 options: CleaningOptions::default(),
45 }
46 }
47
48 #[must_use]
50 pub fn with_options(options: CleaningOptions) -> Self {
51 Self { options }
52 }
53
54 #[must_use]
56 pub fn options(&self) -> &CleaningOptions {
57 &self.options
58 }
59
60 pub fn clean(&self, doc: &Document) {
71 if !self.options.tags_to_remove.is_empty() {
73 let tags: Vec<&str> = self.options.tags_to_remove.iter().map(String::as_str).collect();
74 self.remove_tags(doc, &tags);
75 }
76
77 if !self.options.tags_to_strip.is_empty() {
79 let tags: Vec<&str> = self.options.tags_to_strip.iter().map(String::as_str).collect();
80 self.strip_tags(doc, &tags);
81 }
82
83 for selector in &self.options.selectors_to_remove {
85 self.remove_by_selector(doc, selector);
86 }
87
88 if self.options.remove_comments {
90 self.remove_comments(doc);
91 }
92
93 if self.options.prune_empty {
95 self.prune_empty(doc);
96 }
97
98 if self.options.normalize_whitespace {
100 self.normalize_text(doc);
101 }
102
103 if self.options.strip_attributes {
105 self.clean_attributes(doc);
106 }
107 }
108
109 pub fn remove_comments(&self, doc: &Document) {
124 let body = doc.select("*");
126 for node in body.nodes() {
127 for child in node.children() {
128 if child.is_comment() {
129 child.remove_from_parent();
130 }
131 }
132 }
133 }
134
135 pub fn remove_tags(&self, doc: &Document, tags: &[&str]) {
149 if tags.is_empty() {
150 return;
151 }
152 let selector = tags.join(", ");
153 doc.select(&selector).remove();
154 }
155
156 pub fn strip_tags(&self, doc: &Document, tags: &[&str]) {
173 if tags.is_empty() {
174 return;
175 }
176 let root = doc.select("*").first();
177 if root.exists() {
178 root.strip_elements(tags);
179 }
180 }
181
182 pub fn remove_by_selector(&self, doc: &Document, selector: &str) {
196 doc.select(selector).remove();
197 }
198
199 pub fn prune_empty(&self, doc: &Document) {
207 let empty_tags: Vec<&str> = if self.options.empty_tags.is_empty() {
208 vec!["div", "span", "p", "section", "article"]
209 } else {
210 self.options.empty_tags.iter().map(String::as_str).collect()
211 };
212
213 loop {
215 let mut removed = false;
216 for tag in &empty_tags {
217 let nodes: Vec<_> = doc.select(tag).nodes().to_vec();
218 for node in nodes.into_iter().rev() {
219 let sel = dom_query::Selection::from(node);
220 let children = sel.children();
221 let text = sel.text().to_string();
222
223 if children.is_empty() && text.trim().is_empty() {
224 sel.remove();
225 removed = true;
226 }
227 }
228 }
229 if !removed {
230 break;
231 }
232 }
233 }
234
235 pub fn normalize_text(&self, doc: &Document) {
239 for node in doc.select("*").nodes() {
241 let sel = dom_query::Selection::from(*node);
242
243 if let Some(n) = sel.nodes().first() {
245 for child in n.children() {
246 if child.is_text() {
247 let text = child.text();
248 let text_str = text.to_string();
249 let normalized = crate::text::normalize(&text_str);
250 if text_str != normalized {
251 child.set_text(normalized);
253 }
254 }
255 }
256 }
257 }
258 }
259
260 pub fn clean_attributes(&self, doc: &Document) {
265 let preserved: HashSet<&str> = self
266 .options
267 .preserved_attributes
268 .iter()
269 .map(String::as_str)
270 .collect();
271
272 for node in doc.select("*").nodes() {
273 let sel = dom_query::Selection::from(*node);
274
275 let attrs: Vec<String> = sel
277 .nodes()
278 .first()
279 .map(|n| {
280 n.attrs()
281 .iter()
282 .map(|a| a.name.local.to_string())
283 .collect()
284 })
285 .unwrap_or_default();
286
287 for attr in attrs {
289 if !preserved.contains(attr.as_str()) {
290 sel.remove_attr(&attr);
291 }
292 }
293 }
294 }
295}
296
#[cfg(test)]
mod tests {
    use super::*;

    /// A default cleaner starts with no tags configured for removal.
    #[test]
    fn test_new_cleaner() {
        let cleaner = HtmlCleaner::new();
        assert!(cleaner.options().tags_to_remove.is_empty());
    }

    /// Removing a tag drops the element but leaves its siblings in place.
    #[test]
    fn test_remove_tags() {
        let cleaner = HtmlCleaner::new();
        let doc = Document::from("<div><script>bad</script><p>good</p></div>");
        cleaner.remove_tags(&doc, &["script"]);
        assert!(doc.select("script").is_empty());
        assert!(doc.select("p").exists());
    }

    /// Elements matching a CSS selector are removed; others are kept.
    #[test]
    fn test_remove_by_selector() {
        let cleaner = HtmlCleaner::new();
        let doc = Document::from(r#"<div class="ad">Ad</div><p>Content</p>"#);
        cleaner.remove_by_selector(&doc, ".ad");
        assert!(doc.select(".ad").is_empty());
        assert!(doc.select("p").exists());
    }

    /// Only the empty paragraph is pruned.
    #[test]
    fn test_prune_empty() {
        let options = CleaningOptions {
            prune_empty: true,
            ..Default::default()
        };
        let cleaner = HtmlCleaner::with_options(options);
        let doc = Document::from("<div><p></p><p>Content</p></div>");
        cleaner.prune_empty(&doc);
        assert_eq!(doc.select("p").length(), 1);
    }

    /// Attributes outside the preserved list are stripped.
    #[test]
    fn test_clean_attributes() {
        let options = CleaningOptions {
            strip_attributes: true,
            preserved_attributes: vec!["href".into()],
            ..Default::default()
        };
        let cleaner = HtmlCleaner::with_options(options);
        let doc = Document::from(r#"<a href="url" class="link" id="x">Link</a>"#);
        cleaner.clean_attributes(&doc);

        let a = doc.select("a");
        assert!(a.attr("href").is_some());
        assert!(a.attr("class").is_none());
        assert!(a.attr("id").is_none());
    }

    /// Stripping tags removes the elements but keeps their text.
    #[test]
    fn test_strip_tags_preserves_text() {
        let cleaner = HtmlCleaner::new();
        let doc = Document::from("<div><span>Hello</span> <b>World</b></div>");
        cleaner.strip_tags(&doc, &["span", "b"]);

        assert!(doc.select("span").is_empty());
        assert!(doc.select("b").is_empty());
        let text = doc.select("div").text();
        assert!(text.contains("Hello"), "Text 'Hello' should be preserved");
        assert!(text.contains("World"), "Text 'World' should be preserved");
    }

    /// Comments disappear from the serialized HTML; content stays.
    #[test]
    fn test_remove_comments() {
        let cleaner = HtmlCleaner::new();
        let doc =
            Document::from("<div><!-- This is a comment --><p>Content</p><!-- Another --></div>");
        cleaner.remove_comments(&doc);
        let html = doc.select("div").html().to_string();
        assert!(!html.contains("comment"), "Comments should be removed: {html}");
        assert!(html.contains("Content"), "Content should be preserved");
    }

    /// `clean` honours the `remove_comments` option.
    #[test]
    fn test_clean_with_comments_option() {
        let options = CleaningOptions {
            remove_comments: true,
            ..Default::default()
        };
        let cleaner = HtmlCleaner::with_options(options);
        let doc = Document::from("<div><!-- comment --><p>Text</p></div>");
        cleaner.clean(&doc);
        let html = doc.select("div").html().to_string();
        assert!(!html.contains("comment"));
    }

    /// Runs of spaces inside a text node are collapsed.
    #[test]
    fn test_normalize_text() {
        let options = CleaningOptions {
            normalize_whitespace: true,
            ..Default::default()
        };
        let cleaner = HtmlCleaner::with_options(options);
        // The input must contain runs of spaces, otherwise nothing is tested.
        let doc = Document::from("<p>  Multiple   spaces   here  </p>");
        cleaner.normalize_text(&doc);

        let text = doc.select("p").text();
        // Check for a *double* space: single spaces between words are expected.
        assert!(!text.contains("  "), "Multiple spaces should be collapsed");
    }
}