// html_cleaning/cleaner.rs

1//! Core HTML cleaning functionality.
2
3use crate::options::CleaningOptions;
4use dom_query::Document;
5use std::collections::HashSet;
6
/// HTML cleaning utility.
///
/// Wraps a [`CleaningOptions`] value and applies cleaning operations to a
/// [`dom_query::Document`] in place. Provides methods for removing, stripping,
/// and normalizing HTML elements. Every operation is exposed as its own public
/// method, and [`HtmlCleaner::clean`] runs the ones enabled in the options.
///
/// # Example
///
/// ```
/// use html_cleaning::{HtmlCleaner, CleaningOptions};
/// use dom_query::Document;
///
/// let options = CleaningOptions {
///     tags_to_remove: vec!["script".into(), "style".into()],
///     prune_empty: true,
///     ..Default::default()
/// };
///
/// let cleaner = HtmlCleaner::with_options(options);
/// let doc = Document::from("<div><script>x</script><p>Hello</p></div>");
/// cleaner.clean(&doc);
/// assert!(doc.select("script").is_empty());
/// ```
#[derive(Debug, Clone)]
pub struct HtmlCleaner {
    /// Configuration read by `clean` and by the option-dependent operations
    /// (`prune_empty`, `clean_attributes`).
    options: CleaningOptions,
}
32
33impl Default for HtmlCleaner {
34    fn default() -> Self {
35        Self::new()
36    }
37}
38
39impl HtmlCleaner {
40    /// Create a cleaner with default options.
41    #[must_use]
42    pub fn new() -> Self {
43        Self {
44            options: CleaningOptions::default(),
45        }
46    }
47
48    /// Create a cleaner with custom options.
49    #[must_use]
50    pub fn with_options(options: CleaningOptions) -> Self {
51        Self { options }
52    }
53
54    /// Get a reference to the current options.
55    #[must_use]
56    pub fn options(&self) -> &CleaningOptions {
57        &self.options
58    }
59
60    /// Apply all configured cleaning operations to the document.
61    ///
62    /// Operations are applied in this order:
63    /// 1. Remove tags (with children)
64    /// 2. Strip tags (keep children)
65    /// 3. Remove by CSS selector
66    /// 4. Remove HTML comments
67    /// 5. Prune empty elements
68    /// 6. Normalize whitespace
69    /// 7. Clean attributes
70    pub fn clean(&self, doc: &Document) {
71        // 1. Remove tags
72        if !self.options.tags_to_remove.is_empty() {
73            let tags: Vec<&str> = self.options.tags_to_remove.iter().map(String::as_str).collect();
74            self.remove_tags(doc, &tags);
75        }
76
77        // 2. Strip tags
78        if !self.options.tags_to_strip.is_empty() {
79            let tags: Vec<&str> = self.options.tags_to_strip.iter().map(String::as_str).collect();
80            self.strip_tags(doc, &tags);
81        }
82
83        // 3. Remove by selector
84        for selector in &self.options.selectors_to_remove {
85            self.remove_by_selector(doc, selector);
86        }
87
88        // 4. Remove HTML comments
89        if self.options.remove_comments {
90            self.remove_comments(doc);
91        }
92
93        // 5. Prune empty
94        if self.options.prune_empty {
95            self.prune_empty(doc);
96        }
97
98        // 6. Normalize whitespace
99        if self.options.normalize_whitespace {
100            self.normalize_text(doc);
101        }
102
103        // 7. Clean attributes
104        if self.options.strip_attributes {
105            self.clean_attributes(doc);
106        }
107    }
108
109    /// Remove HTML comment nodes from the document.
110    ///
111    /// Walks the entire DOM tree and removes all comment nodes (`<!-- ... -->`).
112    ///
113    /// # Example
114    ///
115    /// ```
116    /// use html_cleaning::HtmlCleaner;
117    /// use dom_query::Document;
118    ///
119    /// let cleaner = HtmlCleaner::new();
120    /// let doc = Document::from("<div><!-- comment --><p>Content</p></div>");
121    /// cleaner.remove_comments(&doc);
122    /// ```
123    pub fn remove_comments(&self, doc: &Document) {
124        // Walk all nodes and collect comment nodes
125        let body = doc.select("*");
126        for node in body.nodes() {
127            for child in node.children() {
128                if child.is_comment() {
129                    child.remove_from_parent();
130                }
131            }
132        }
133    }
134
135    /// Remove elements matching tags (including all children).
136    ///
137    /// # Example
138    ///
139    /// ```
140    /// use html_cleaning::HtmlCleaner;
141    /// use dom_query::Document;
142    ///
143    /// let cleaner = HtmlCleaner::new();
144    /// let doc = Document::from("<div><script>bad</script><p>good</p></div>");
145    /// cleaner.remove_tags(&doc, &["script"]);
146    /// assert!(doc.select("script").is_empty());
147    /// ```
148    pub fn remove_tags(&self, doc: &Document, tags: &[&str]) {
149        if tags.is_empty() {
150            return;
151        }
152        let selector = tags.join(", ");
153        doc.select(&selector).remove();
154    }
155
156    /// Strip tags but preserve their children.
157    ///
158    /// The tag wrapper is removed but inner content (text and child elements)
159    /// is moved to the parent.
160    ///
161    /// # Example
162    ///
163    /// ```
164    /// use html_cleaning::HtmlCleaner;
165    /// use dom_query::Document;
166    ///
167    /// let cleaner = HtmlCleaner::new();
168    /// let doc = Document::from("<div><span>text</span></div>");
169    /// cleaner.strip_tags(&doc, &["span"]);
170    /// assert!(doc.select("span").is_empty());
171    /// ```
172    pub fn strip_tags(&self, doc: &Document, tags: &[&str]) {
173        if tags.is_empty() {
174            return;
175        }
176        let root = doc.select("*").first();
177        if root.exists() {
178            root.strip_elements(tags);
179        }
180    }
181
182    /// Remove elements matching a CSS selector.
183    ///
184    /// # Example
185    ///
186    /// ```
187    /// use html_cleaning::HtmlCleaner;
188    /// use dom_query::Document;
189    ///
190    /// let cleaner = HtmlCleaner::new();
191    /// let doc = Document::from(r#"<div class="ad">Ad</div><p>Content</p>"#);
192    /// cleaner.remove_by_selector(&doc, ".ad");
193    /// assert!(doc.select(".ad").is_empty());
194    /// ```
195    pub fn remove_by_selector(&self, doc: &Document, selector: &str) {
196        doc.select(selector).remove();
197    }
198
199    /// Remove empty elements.
200    ///
201    /// Elements are considered empty if they:
202    /// - Have no child elements
203    /// - Have no text content (or only whitespace)
204    ///
205    /// Processes in reverse document order (children before parents).
206    pub fn prune_empty(&self, doc: &Document) {
207        let empty_tags: Vec<&str> = if self.options.empty_tags.is_empty() {
208            vec!["div", "span", "p", "section", "article"]
209        } else {
210            self.options.empty_tags.iter().map(String::as_str).collect()
211        };
212
213        // Loop until no more empty elements found
214        loop {
215            let mut removed = false;
216            for tag in &empty_tags {
217                let nodes: Vec<_> = doc.select(tag).nodes().to_vec();
218                for node in nodes.into_iter().rev() {
219                    let sel = dom_query::Selection::from(node);
220                    let children = sel.children();
221                    let text = sel.text().to_string();
222
223                    if children.is_empty() && text.trim().is_empty() {
224                        sel.remove();
225                        removed = true;
226                    }
227                }
228            }
229            if !removed {
230                break;
231            }
232        }
233    }
234
235    /// Normalize text nodes (trim, collapse whitespace).
236    ///
237    /// Walks all text nodes and collapses multiple whitespace to single space.
238    pub fn normalize_text(&self, doc: &Document) {
239        // Process all elements and normalize their text content
240        for node in doc.select("*").nodes() {
241            let sel = dom_query::Selection::from(*node);
242
243            // Get direct text children and normalize them
244            if let Some(n) = sel.nodes().first() {
245                for child in n.children() {
246                    if child.is_text() {
247                        let text = child.text();
248                        let text_str = text.to_string();
249                        let normalized = crate::text::normalize(&text_str);
250                        if text_str != normalized {
251                            // Replace text node content by updating via the node
252                            child.set_text(normalized);
253                        }
254                    }
255                }
256            }
257        }
258    }
259
260    /// Remove or filter attributes from all elements.
261    ///
262    /// If `strip_attributes` is true in options:
263    /// - Removes all attributes except those in `preserved_attributes`
264    pub fn clean_attributes(&self, doc: &Document) {
265        let preserved: HashSet<&str> = self
266            .options
267            .preserved_attributes
268            .iter()
269            .map(String::as_str)
270            .collect();
271
272        for node in doc.select("*").nodes() {
273            let sel = dom_query::Selection::from(*node);
274
275            // Get all attribute names first
276            let attrs: Vec<String> = sel
277                .nodes()
278                .first()
279                .map(|n| {
280                    n.attrs()
281                        .iter()
282                        .map(|a| a.name.local.to_string())
283                        .collect()
284                })
285                .unwrap_or_default();
286
287            // Remove non-preserved attributes
288            for attr in attrs {
289                if !preserved.contains(attr.as_str()) {
290                    sel.remove_attr(&attr);
291                }
292            }
293        }
294    }
295}
296
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `markup` into a fresh document for a test case.
    fn parse(markup: &str) -> Document {
        Document::from(markup)
    }

    /// A freshly constructed cleaner starts with no tags configured for removal.
    #[test]
    fn test_new_cleaner() {
        assert!(HtmlCleaner::new().options().tags_to_remove.is_empty());
    }

    /// Removing a tag drops it and leaves its siblings in place.
    #[test]
    fn test_remove_tags() {
        let document = parse("<div><script>bad</script><p>good</p></div>");
        HtmlCleaner::new().remove_tags(&document, &["script"]);

        assert!(document.select("script").is_empty());
        assert!(document.select("p").exists());
    }

    /// Selector-based removal only affects the matching elements.
    #[test]
    fn test_remove_by_selector() {
        let document = parse(r#"<div class="ad">Ad</div><p>Content</p>"#);
        HtmlCleaner::new().remove_by_selector(&document, ".ad");

        assert!(document.select(".ad").is_empty());
        assert!(document.select("p").exists());
    }

    /// Pruning removes the empty paragraph and keeps the one with content.
    #[test]
    fn test_prune_empty() {
        let cleaner = HtmlCleaner::with_options(CleaningOptions {
            prune_empty: true,
            ..Default::default()
        });
        let document = parse("<div><p></p><p>Content</p></div>");
        cleaner.prune_empty(&document);

        assert_eq!(document.select("p").length(), 1);
    }

    /// Only attributes listed as preserved survive attribute cleaning.
    #[test]
    fn test_clean_attributes() {
        let cleaner = HtmlCleaner::with_options(CleaningOptions {
            strip_attributes: true,
            preserved_attributes: vec!["href".into()],
            ..Default::default()
        });
        let document = parse(r#"<a href="url" class="link" id="x">Link</a>"#);
        cleaner.clean_attributes(&document);

        let anchor = document.select("a");
        assert!(anchor.attr("href").is_some());
        assert!(anchor.attr("class").is_none());
        assert!(anchor.attr("id").is_none());
    }

    /// Stripping wrappers keeps the text they contained.
    #[test]
    fn test_strip_tags_preserves_text() {
        let document = parse("<div><span>Hello</span> <b>World</b></div>");
        HtmlCleaner::new().strip_tags(&document, &["span", "b"]);

        assert!(document.select("span").is_empty());
        assert!(document.select("b").is_empty());

        let remaining = document.select("div").text();
        assert!(remaining.contains("Hello"), "Text 'Hello' should be preserved");
        assert!(remaining.contains("World"), "Text 'World' should be preserved");
    }

    /// Comment nodes inside an element are removed while content stays.
    #[test]
    fn test_remove_comments() {
        let document =
            parse("<div><!-- This is a comment --><p>Content</p><!-- Another --></div>");
        HtmlCleaner::new().remove_comments(&document);

        let html = document.select("div").html().to_string();
        assert!(!html.contains("comment"), "Comments should be removed: {html}");
        assert!(html.contains("Content"), "Content should be preserved");
    }

    /// `clean` honours the `remove_comments` option.
    #[test]
    fn test_clean_with_comments_option() {
        let cleaner = HtmlCleaner::with_options(CleaningOptions {
            remove_comments: true,
            ..Default::default()
        });
        let document = parse("<div><!-- comment --><p>Text</p></div>");
        cleaner.clean(&document);

        let html = document.select("div").html().to_string();
        assert!(!html.contains("comment"));
    }

    /// Runs of spaces inside a text node collapse after normalization.
    #[test]
    fn test_normalize_text() {
        let cleaner = HtmlCleaner::with_options(CleaningOptions {
            normalize_whitespace: true,
            ..Default::default()
        });
        let document = parse("<p>  Multiple   spaces   here  </p>");
        cleaner.normalize_text(&document);

        // Text should have collapsed whitespace
        let paragraph_text = document.select("p").text();
        assert!(
            !paragraph_text.contains("  "),
            "Multiple spaces should be collapsed"
        );
    }
}