1use crate::options::CleaningOptions;
4use dom_query::Document;
5use std::collections::HashSet;
6
7#[derive(Debug, Clone)]
29pub struct HtmlCleaner {
30 options: CleaningOptions,
31}
32
33impl Default for HtmlCleaner {
34 fn default() -> Self {
35 Self::new()
36 }
37}
38
39impl HtmlCleaner {
40 #[must_use]
42 pub fn new() -> Self {
43 Self {
44 options: CleaningOptions::default(),
45 }
46 }
47
48 #[must_use]
50 pub fn with_options(options: CleaningOptions) -> Self {
51 Self { options }
52 }
53
54 #[must_use]
56 pub fn options(&self) -> &CleaningOptions {
57 &self.options
58 }
59
60 pub fn clean(&self, doc: &Document) {
70 if !self.options.tags_to_remove.is_empty() {
72 let tags: Vec<&str> = self.options.tags_to_remove.iter().map(String::as_str).collect();
73 self.remove_tags(doc, &tags);
74 }
75
76 if !self.options.tags_to_strip.is_empty() {
78 let tags: Vec<&str> = self.options.tags_to_strip.iter().map(String::as_str).collect();
79 self.strip_tags(doc, &tags);
80 }
81
82 for selector in &self.options.selectors_to_remove {
84 self.remove_by_selector(doc, selector);
85 }
86
87 if self.options.prune_empty {
89 self.prune_empty(doc);
90 }
91
92 if self.options.normalize_whitespace {
94 self.normalize_text(doc);
95 }
96
97 if self.options.strip_attributes {
99 self.clean_attributes(doc);
100 }
101 }
102
103 pub fn remove_tags(&self, doc: &Document, tags: &[&str]) {
117 if tags.is_empty() {
118 return;
119 }
120 let selector = tags.join(", ");
121 doc.select(&selector).remove();
122 }
123
124 pub fn strip_tags(&self, doc: &Document, tags: &[&str]) {
141 if tags.is_empty() {
142 return;
143 }
144 let root = doc.select("*").first();
145 if root.exists() {
146 root.strip_elements(tags);
147 }
148 }
149
150 pub fn remove_by_selector(&self, doc: &Document, selector: &str) {
164 doc.select(selector).remove();
165 }
166
167 pub fn prune_empty(&self, doc: &Document) {
175 let empty_tags: Vec<&str> = if self.options.empty_tags.is_empty() {
176 vec!["div", "span", "p", "section", "article"]
177 } else {
178 self.options.empty_tags.iter().map(String::as_str).collect()
179 };
180
181 loop {
183 let mut removed = false;
184 for tag in &empty_tags {
185 let nodes: Vec<_> = doc.select(tag).nodes().to_vec();
186 for node in nodes.into_iter().rev() {
187 let sel = dom_query::Selection::from(node);
188 let children = sel.children();
189 let text = sel.text().to_string();
190
191 if children.is_empty() && text.trim().is_empty() {
192 sel.remove();
193 removed = true;
194 }
195 }
196 }
197 if !removed {
198 break;
199 }
200 }
201 }
202
203 pub fn normalize_text(&self, doc: &Document) {
207 for node in doc.select("*").nodes() {
209 let sel = dom_query::Selection::from(*node);
210
211 if let Some(n) = sel.nodes().first() {
213 for child in n.children() {
214 if child.is_text() {
215 let text = child.text();
216 let text_str = text.to_string();
217 let normalized = crate::text::normalize(&text_str);
218 if text_str != normalized {
219 child.set_text(normalized);
221 }
222 }
223 }
224 }
225 }
226 }
227
228 pub fn clean_attributes(&self, doc: &Document) {
233 let preserved: HashSet<&str> = self
234 .options
235 .preserved_attributes
236 .iter()
237 .map(String::as_str)
238 .collect();
239
240 for node in doc.select("*").nodes() {
241 let sel = dom_query::Selection::from(*node);
242
243 let attrs: Vec<String> = sel
245 .nodes()
246 .first()
247 .map(|n| {
248 n.attrs()
249 .iter()
250 .map(|a| a.name.local.to_string())
251 .collect()
252 })
253 .unwrap_or_default();
254
255 for attr in attrs {
257 if !preserved.contains(attr.as_str()) {
258 sel.remove_attr(&attr);
259 }
260 }
261 }
262 }
263}
264
#[cfg(test)]
mod tests {
    use super::*;

    /// A cleaner built with `new` starts with no tags configured for removal.
    #[test]
    fn test_new_cleaner() {
        let cleaner = HtmlCleaner::new();
        assert!(cleaner.options().tags_to_remove.is_empty());
    }

    /// `remove_tags` drops the named element and leaves its siblings alone.
    #[test]
    fn test_remove_tags() {
        let cleaner = HtmlCleaner::new();
        let doc = Document::from("<div><script>bad</script><p>good</p></div>");
        cleaner.remove_tags(&doc, &["script"]);
        assert!(doc.select("script").is_empty());
        assert!(doc.select("p").exists());
    }

    /// `remove_by_selector` drops only the elements matching the selector.
    #[test]
    fn test_remove_by_selector() {
        let cleaner = HtmlCleaner::new();
        let doc = Document::from(r#"<div class="ad">Ad</div><p>Content</p>"#);
        cleaner.remove_by_selector(&doc, ".ad");
        assert!(doc.select(".ad").is_empty());
        assert!(doc.select("p").exists());
    }

    /// `prune_empty` removes the blank paragraph and keeps the one with text.
    #[test]
    fn test_prune_empty() {
        let options = CleaningOptions {
            prune_empty: true,
            ..Default::default()
        };
        let cleaner = HtmlCleaner::with_options(options);
        let doc = Document::from("<div><p></p><p>Content</p></div>");
        cleaner.prune_empty(&doc);
        assert_eq!(doc.select("p").length(), 1);
    }

    /// `clean_attributes` keeps preserved attributes and drops the rest.
    #[test]
    fn test_clean_attributes() {
        let options = CleaningOptions {
            strip_attributes: true,
            preserved_attributes: vec!["href".into()],
            ..Default::default()
        };
        let cleaner = HtmlCleaner::with_options(options);
        let doc = Document::from(r#"<a href="url" class="link" id="x">Link</a>"#);
        cleaner.clean_attributes(&doc);

        let a = doc.select("a");
        assert!(a.attr("href").is_some());
        assert!(a.attr("class").is_none());
        assert!(a.attr("id").is_none());
    }

    /// `strip_tags` removes the wrapping elements but keeps their text.
    #[test]
    fn test_strip_tags_preserves_text() {
        let cleaner = HtmlCleaner::new();
        let doc = Document::from("<div><span>Hello</span> <b>World</b></div>");
        cleaner.strip_tags(&doc, &["span", "b"]);

        assert!(doc.select("span").is_empty());
        assert!(doc.select("b").is_empty());
        let text = doc.select("div").text();
        assert!(text.contains("Hello"), "Text 'Hello' should be preserved");
        assert!(text.contains("World"), "Text 'World' should be preserved");
    }

    /// `normalize_text` collapses runs of spaces inside text nodes.
    #[test]
    fn test_normalize_text() {
        let options = CleaningOptions {
            normalize_whitespace: true,
            ..Default::default()
        };
        let cleaner = HtmlCleaner::with_options(options);
        // The fixture must contain repeated spaces, otherwise there is
        // nothing to collapse and the assertion below proves nothing.
        let doc = Document::from("<p>  Multiple   spaces   here  </p>");
        cleaner.normalize_text(&doc);

        let text = doc.select("p").text();
        // Check for a *double* space: single spaces between words remain.
        assert!(!text.contains("  "), "Multiple spaces should be collapsed");
    }
}