Skip to main content

html_cleaning/
cleaner.rs

1//! Core HTML cleaning functionality.
2
3use crate::options::CleaningOptions;
4use dom_query::Document;
5use std::collections::HashSet;
6
7/// HTML cleaning utility.
8///
9/// Provides methods for removing, stripping, and normalizing HTML elements.
10///
11/// # Example
12///
13/// ```
14/// use html_cleaning::{HtmlCleaner, CleaningOptions};
15/// use dom_query::Document;
16///
17/// let options = CleaningOptions {
18///     tags_to_remove: vec!["script".into(), "style".into()],
19///     prune_empty: true,
20///     ..Default::default()
21/// };
22///
23/// let cleaner = HtmlCleaner::with_options(options);
24/// let doc = Document::from("<div><script>x</script><p>Hello</p></div>");
25/// cleaner.clean(&doc);
26/// assert!(doc.select("script").is_empty());
27/// ```
28#[derive(Debug, Clone)]
29pub struct HtmlCleaner {
30    options: CleaningOptions,
31}
32
33impl Default for HtmlCleaner {
34    fn default() -> Self {
35        Self::new()
36    }
37}
38
39impl HtmlCleaner {
40    /// Create a cleaner with default options.
41    #[must_use]
42    pub fn new() -> Self {
43        Self {
44            options: CleaningOptions::default(),
45        }
46    }
47
48    /// Create a cleaner with custom options.
49    #[must_use]
50    pub fn with_options(options: CleaningOptions) -> Self {
51        Self { options }
52    }
53
54    /// Get a reference to the current options.
55    #[must_use]
56    pub fn options(&self) -> &CleaningOptions {
57        &self.options
58    }
59
60    /// Apply all configured cleaning operations to the document.
61    ///
62    /// Operations are applied in this order:
63    /// 1. Remove tags (with children)
64    /// 2. Strip tags (keep children)
65    /// 3. Remove by CSS selector
66    /// 4. Prune empty elements
67    /// 5. Normalize whitespace
68    /// 6. Clean attributes
69    pub fn clean(&self, doc: &Document) {
70        // 1. Remove tags
71        if !self.options.tags_to_remove.is_empty() {
72            let tags: Vec<&str> = self.options.tags_to_remove.iter().map(String::as_str).collect();
73            self.remove_tags(doc, &tags);
74        }
75
76        // 2. Strip tags
77        if !self.options.tags_to_strip.is_empty() {
78            let tags: Vec<&str> = self.options.tags_to_strip.iter().map(String::as_str).collect();
79            self.strip_tags(doc, &tags);
80        }
81
82        // 3. Remove by selector
83        for selector in &self.options.selectors_to_remove {
84            self.remove_by_selector(doc, selector);
85        }
86
87        // 4. Prune empty
88        if self.options.prune_empty {
89            self.prune_empty(doc);
90        }
91
92        // 5. Normalize whitespace
93        if self.options.normalize_whitespace {
94            self.normalize_text(doc);
95        }
96
97        // 6. Clean attributes
98        if self.options.strip_attributes {
99            self.clean_attributes(doc);
100        }
101    }
102
103    /// Remove elements matching tags (including all children).
104    ///
105    /// # Example
106    ///
107    /// ```
108    /// use html_cleaning::HtmlCleaner;
109    /// use dom_query::Document;
110    ///
111    /// let cleaner = HtmlCleaner::new();
112    /// let doc = Document::from("<div><script>bad</script><p>good</p></div>");
113    /// cleaner.remove_tags(&doc, &["script"]);
114    /// assert!(doc.select("script").is_empty());
115    /// ```
116    pub fn remove_tags(&self, doc: &Document, tags: &[&str]) {
117        if tags.is_empty() {
118            return;
119        }
120        let selector = tags.join(", ");
121        doc.select(&selector).remove();
122    }
123
124    /// Strip tags but preserve their children.
125    ///
126    /// The tag wrapper is removed but inner content (text and child elements)
127    /// is moved to the parent.
128    ///
129    /// # Example
130    ///
131    /// ```
132    /// use html_cleaning::HtmlCleaner;
133    /// use dom_query::Document;
134    ///
135    /// let cleaner = HtmlCleaner::new();
136    /// let doc = Document::from("<div><span>text</span></div>");
137    /// cleaner.strip_tags(&doc, &["span"]);
138    /// assert!(doc.select("span").is_empty());
139    /// ```
140    pub fn strip_tags(&self, doc: &Document, tags: &[&str]) {
141        if tags.is_empty() {
142            return;
143        }
144        let root = doc.select("*").first();
145        if root.exists() {
146            root.strip_elements(tags);
147        }
148    }
149
150    /// Remove elements matching a CSS selector.
151    ///
152    /// # Example
153    ///
154    /// ```
155    /// use html_cleaning::HtmlCleaner;
156    /// use dom_query::Document;
157    ///
158    /// let cleaner = HtmlCleaner::new();
159    /// let doc = Document::from(r#"<div class="ad">Ad</div><p>Content</p>"#);
160    /// cleaner.remove_by_selector(&doc, ".ad");
161    /// assert!(doc.select(".ad").is_empty());
162    /// ```
163    pub fn remove_by_selector(&self, doc: &Document, selector: &str) {
164        doc.select(selector).remove();
165    }
166
167    /// Remove empty elements.
168    ///
169    /// Elements are considered empty if they:
170    /// - Have no child elements
171    /// - Have no text content (or only whitespace)
172    ///
173    /// Processes in reverse document order (children before parents).
174    pub fn prune_empty(&self, doc: &Document) {
175        let empty_tags: Vec<&str> = if self.options.empty_tags.is_empty() {
176            vec!["div", "span", "p", "section", "article"]
177        } else {
178            self.options.empty_tags.iter().map(String::as_str).collect()
179        };
180
181        // Loop until no more empty elements found
182        loop {
183            let mut removed = false;
184            for tag in &empty_tags {
185                let nodes: Vec<_> = doc.select(tag).nodes().to_vec();
186                for node in nodes.into_iter().rev() {
187                    let sel = dom_query::Selection::from(node);
188                    let children = sel.children();
189                    let text = sel.text().to_string();
190
191                    if children.is_empty() && text.trim().is_empty() {
192                        sel.remove();
193                        removed = true;
194                    }
195                }
196            }
197            if !removed {
198                break;
199            }
200        }
201    }
202
203    /// Normalize text nodes (trim, collapse whitespace).
204    ///
205    /// Walks all text nodes and collapses multiple whitespace to single space.
206    pub fn normalize_text(&self, doc: &Document) {
207        // Process all elements and normalize their text content
208        for node in doc.select("*").nodes() {
209            let sel = dom_query::Selection::from(*node);
210
211            // Get direct text children and normalize them
212            if let Some(n) = sel.nodes().first() {
213                for child in n.children() {
214                    if child.is_text() {
215                        let text = child.text();
216                        let text_str = text.to_string();
217                        let normalized = crate::text::normalize(&text_str);
218                        if text_str != normalized {
219                            // Replace text node content by updating via the node
220                            child.set_text(normalized);
221                        }
222                    }
223                }
224            }
225        }
226    }
227
228    /// Remove or filter attributes from all elements.
229    ///
230    /// If `strip_attributes` is true in options:
231    /// - Removes all attributes except those in `preserved_attributes`
232    pub fn clean_attributes(&self, doc: &Document) {
233        let preserved: HashSet<&str> = self
234            .options
235            .preserved_attributes
236            .iter()
237            .map(String::as_str)
238            .collect();
239
240        for node in doc.select("*").nodes() {
241            let sel = dom_query::Selection::from(*node);
242
243            // Get all attribute names first
244            let attrs: Vec<String> = sel
245                .nodes()
246                .first()
247                .map(|n| {
248                    n.attrs()
249                        .iter()
250                        .map(|a| a.name.local.to_string())
251                        .collect()
252                })
253                .unwrap_or_default();
254
255            // Remove non-preserved attributes
256            for attr in attrs {
257                if !preserved.contains(attr.as_str()) {
258                    sel.remove_attr(&attr);
259                }
260            }
261        }
262    }
263}
264
#[cfg(test)]
mod tests {
    use super::*;

    /// A default cleaner has no tags queued for removal.
    #[test]
    fn test_new_cleaner() {
        assert!(HtmlCleaner::new().options().tags_to_remove.is_empty());
    }

    /// Removing `script` drops it and leaves sibling content in place.
    #[test]
    fn test_remove_tags() {
        let doc = Document::from("<div><script>bad</script><p>good</p></div>");
        HtmlCleaner::new().remove_tags(&doc, &["script"]);

        assert!(doc.select("script").is_empty());
        assert!(doc.select("p").exists());
    }

    /// A class selector removes only the matching element.
    #[test]
    fn test_remove_by_selector() {
        let doc = Document::from(r#"<div class="ad">Ad</div><p>Content</p>"#);
        HtmlCleaner::new().remove_by_selector(&doc, ".ad");

        assert!(doc.select(".ad").is_empty());
        assert!(doc.select("p").exists());
    }

    /// The empty paragraph is pruned, the one with content survives.
    #[test]
    fn test_prune_empty() {
        let cleaner = HtmlCleaner::with_options(CleaningOptions {
            prune_empty: true,
            ..Default::default()
        });
        let doc = Document::from("<div><p></p><p>Content</p></div>");
        cleaner.prune_empty(&doc);

        assert_eq!(doc.select("p").length(), 1);
    }

    /// Only attributes listed as preserved remain after cleaning.
    #[test]
    fn test_clean_attributes() {
        let cleaner = HtmlCleaner::with_options(CleaningOptions {
            strip_attributes: true,
            preserved_attributes: vec!["href".into()],
            ..Default::default()
        });
        let doc = Document::from(r#"<a href="url" class="link" id="x">Link</a>"#);
        cleaner.clean_attributes(&doc);

        let anchor = doc.select("a");
        assert!(anchor.attr("href").is_some());
        assert!(anchor.attr("class").is_none());
        assert!(anchor.attr("id").is_none());
    }

    /// Stripping wrappers keeps the text they contained.
    #[test]
    fn test_strip_tags_preserves_text() {
        let doc = Document::from("<div><span>Hello</span> <b>World</b></div>");
        HtmlCleaner::new().strip_tags(&doc, &["span", "b"]);

        assert!(doc.select("span").is_empty());
        assert!(doc.select("b").is_empty());

        let remaining = doc.select("div").text();
        assert!(remaining.contains("Hello"), "Text 'Hello' should be preserved");
        assert!(remaining.contains("World"), "Text 'World' should be preserved");
    }

    /// Runs of spaces inside a paragraph are collapsed.
    #[test]
    fn test_normalize_text() {
        let cleaner = HtmlCleaner::with_options(CleaningOptions {
            normalize_whitespace: true,
            ..Default::default()
        });
        let doc = Document::from("<p>  Multiple   spaces   here  </p>");
        cleaner.normalize_text(&doc);

        let collapsed = doc.select("p").text();
        // Text should have collapsed whitespace
        assert!(!collapsed.contains("  "), "Multiple spaces should be collapsed");
    }
}