Skip to main content

html_cleaning/
dom.rs

1//! DOM helper utilities.
2//!
3//! Convenience functions for common DOM operations.
4
5use dom_query::Selection;
6pub use dom_query::Document;
7
8/// Get text content of element (recursive).
9///
10/// # Example
11///
12/// ```
13/// use html_cleaning::dom;
14///
15/// let doc = dom::parse("<div>Hello <span>World</span></div>");
16/// assert_eq!(dom::text_content(&doc.select("div")), "Hello World");
17/// ```
18#[must_use]
19pub fn text_content(sel: &Selection) -> String {
20    sel.text().to_string()
21}
22
23/// Get direct text of element (non-recursive, excludes nested element text).
24///
25/// # Example
26///
27/// ```
28/// use html_cleaning::dom;
29///
30/// let doc = dom::parse("<div>Direct <span>Nested</span> text</div>");
31/// let direct = dom::direct_text(&doc.select("div"));
32/// assert!(direct.contains("Direct"));
33/// assert!(!direct.contains("Nested"));
34/// ```
35#[must_use]
36pub fn direct_text(sel: &Selection) -> String {
37    sel.nodes()
38        .first()
39        .map(|node| {
40            node.children()
41                .into_iter()
42                .filter(dom_query::NodeRef::is_text)
43                .map(|text_node| text_node.text().to_string())
44                .collect::<String>()
45        })
46        .unwrap_or_default()
47}
48
49/// Get tag name (lowercase).
50///
51/// # Example
52///
53/// ```
54/// use html_cleaning::dom;
55///
56/// let doc = dom::parse("<ARTICLE>Content</ARTICLE>");
57/// assert_eq!(dom::tag_name(&doc.select("article")), Some("article".to_string()));
58/// ```
59#[must_use]
60pub fn tag_name(sel: &Selection) -> Option<String> {
61    sel.nodes()
62        .first()
63        .and_then(dom_query::NodeRef::node_name)
64        .map(|t| t.to_string())
65}
66
67/// Get attribute value.
68///
69/// # Example
70///
71/// ```
72/// use html_cleaning::dom;
73///
74/// let doc = dom::parse(r#"<a href="https://example.com">Link</a>"#);
75/// assert_eq!(dom::get_attribute(&doc.select("a"), "href"), Some("https://example.com".to_string()));
76/// ```
77#[must_use]
78pub fn get_attribute(sel: &Selection, name: &str) -> Option<String> {
79    sel.attr(name).map(|s| s.to_string())
80}
81
/// Set attribute value.
///
/// Forwards to `Selection::set_attr`, storing `value` under the attribute
/// `name`. Nothing is returned; the change is made through the shared
/// `&Selection` handle.
///
/// NOTE(review): for a selection holding several nodes, which of them are
/// updated is decided by `Selection::set_attr` — confirm against dom_query.
pub fn set_attribute(sel: &Selection, name: &str, value: &str) {
    sel.set_attr(name, value);
}
86
/// Remove attribute.
///
/// Forwards to `Selection::remove_attr` for the attribute called `name`.
/// Nothing is returned.
///
/// NOTE(review): behaviour for a missing attribute or a multi-node selection
/// is whatever `Selection::remove_attr` does — confirm against dom_query.
pub fn remove_attribute(sel: &Selection, name: &str) {
    sel.remove_attr(name);
}
91
/// Check if attribute exists.
///
/// Returns the result of `Selection::has_attr` for the attribute `name`;
/// the attribute's value is not inspected here.
#[must_use]
pub fn has_attribute(sel: &Selection, name: &str) -> bool {
    sel.has_attr(name)
}
97
98/// Get all attributes as key-value pairs.
99#[must_use]
100pub fn get_all_attributes(sel: &Selection) -> Vec<(String, String)> {
101    sel.nodes()
102        .first()
103        .map(|node| {
104            node.attrs()
105                .iter()
106                .map(|attr| (attr.name.local.to_string(), attr.value.to_string()))
107                .collect()
108        })
109        .unwrap_or_default()
110}
111
/// Get direct element children.
///
/// Returns the selection produced by `Selection::children`; the result
/// shares the lifetime `'a` of the input selection.
#[must_use]
pub fn children<'a>(sel: &Selection<'a>) -> Selection<'a> {
    sel.children()
}
117
/// Get parent element.
///
/// Returns the selection produced by `Selection::parent`; the result shares
/// the lifetime `'a` of the input selection.
///
/// NOTE(review): what is returned for a node without a parent is decided by
/// `Selection::parent` (presumably an empty selection) — confirm.
#[must_use]
pub fn parent<'a>(sel: &Selection<'a>) -> Selection<'a> {
    sel.parent()
}
123
124/// Get next element sibling (skipping text nodes).
125#[must_use]
126pub fn next_element_sibling<'a>(sel: &Selection<'a>) -> Option<Selection<'a>> {
127    sel.nodes().first().and_then(|node| {
128        let mut sibling = node.next_sibling();
129        while let Some(s) = sibling {
130            if s.is_element() {
131                return Some(Selection::from(s));
132            }
133            sibling = s.next_sibling();
134        }
135        None
136    })
137}
138
139/// Get previous element sibling (skipping text nodes).
140#[must_use]
141pub fn previous_element_sibling<'a>(sel: &Selection<'a>) -> Option<Selection<'a>> {
142    sel.nodes().first().and_then(|node| {
143        let mut sibling = node.prev_sibling();
144        while let Some(s) = sibling {
145            if s.is_element() {
146                return Some(Selection::from(s));
147            }
148            sibling = s.prev_sibling();
149        }
150        None
151    })
152}
153
154/// Check if element is a void element (self-closing).
155#[must_use]
156pub fn is_void_element(sel: &Selection) -> bool {
157    const VOID_ELEMENTS: &[&str] = &[
158        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
159        "source", "track", "wbr",
160    ];
161
162    tag_name(sel).is_some_and(|t| VOID_ELEMENTS.contains(&t.as_str()))
163}
164
165/// Check if element has specified class.
166#[must_use]
167pub fn has_class(sel: &Selection, class: &str) -> bool {
168    sel.attr("class")
169        .is_some_and(|c| c.split_whitespace().any(|c| c == class))
170}
171
172/// Add a class to the element.
173///
174/// # Example
175///
176/// ```
177/// use html_cleaning::dom;
178///
179/// let doc = dom::parse(r#"<div class="foo">Content</div>"#);
180/// let div = doc.select("div");
181/// dom::add_class(&div, "bar");
182/// assert!(dom::has_class(&div, "foo"));
183/// assert!(dom::has_class(&div, "bar"));
184/// ```
185pub fn add_class(sel: &Selection, class: &str) {
186    if class.is_empty() {
187        return;
188    }
189
190    match sel.attr("class") {
191        Some(existing) => {
192            // Check if class already exists
193            if !existing.split_whitespace().any(|c| c == class) {
194                let new_class = format!("{existing} {class}");
195                sel.set_attr("class", &new_class);
196            }
197        }
198        None => {
199            sel.set_attr("class", class);
200        }
201    }
202}
203
204/// Remove a class from the element.
205pub fn remove_class(sel: &Selection, class: &str) {
206    if let Some(existing) = sel.attr("class") {
207        let new_class: Vec<&str> = existing
208            .split_whitespace()
209            .filter(|c| *c != class)
210            .collect();
211
212        if new_class.is_empty() {
213            sel.remove_attr("class");
214        } else {
215            sel.set_attr("class", &new_class.join(" "));
216        }
217    }
218}
219
/// Check if element matches a CSS selector.
///
/// Returns the result of `Selection::is` for `selector`.
///
/// NOTE(review): how an unparsable selector string is handled is decided by
/// `Selection::is` — confirm against dom_query before passing untrusted input.
///
/// # Example
///
/// ```
/// use html_cleaning::dom;
///
/// let doc = dom::parse(r#"<div class="content" id="main">Text</div>"#);
/// let div = doc.select("div");
/// assert!(dom::matches(&div, ".content"));
/// assert!(dom::matches(&div, "#main"));
/// assert!(!dom::matches(&div, ".sidebar"));
/// ```
#[must_use]
pub fn matches(sel: &Selection, selector: &str) -> bool {
    sel.is(selector)
}
237
238/// Get inner HTML content.
239#[must_use]
240pub fn inner_html(sel: &Selection) -> String {
241    sel.inner_html().to_string()
242}
243
244/// Get outer HTML content.
245#[must_use]
246pub fn outer_html(sel: &Selection) -> String {
247    sel.html().to_string()
248}
249
250/// Parse HTML string into Document.
251#[must_use]
252pub fn parse(html: &str) -> Document {
253    Document::from(html)
254}
255
256/// Clone a document.
257#[must_use]
258pub fn clone_document(doc: &Document) -> Document {
259    Document::from(doc.html().to_string())
260}
261
/// Rename element tag.
///
/// Forwards to `Selection::rename` with `new_tag` as the new tag name.
/// Nothing is returned; the change is made through the shared `&Selection`
/// handle.
pub fn rename(sel: &Selection, new_tag: &str) {
    sel.rename(new_tag);
}
266
/// Remove element from DOM.
///
/// Forwards to `Selection::remove`. Nothing is returned.
///
/// NOTE(review): whether descendants are removed together with the element
/// is decided by `Selection::remove` (presumably yes) — confirm.
pub fn remove(sel: &Selection) {
    sel.remove();
}
271
#[cfg(test)]
mod tests {
    use super::*;

    /// Nested element text is part of `text_content`.
    #[test]
    fn test_text_content() {
        let doc = parse("<div>Hello <span>World</span></div>");
        assert_eq!(text_content(&doc.select("div")), "Hello World");
    }

    /// The tag name of a selected element is reported.
    #[test]
    fn test_tag_name() {
        let doc = parse("<article>Content</article>");
        let name = tag_name(&doc.select("article"));
        assert_eq!(name.as_deref(), Some("article"));
    }

    /// Attribute lookup, presence check and full listing agree with the markup.
    #[test]
    fn test_attributes() {
        let doc = parse(r#"<a href="url" class="link">Link</a>"#);
        let link = doc.select("a");

        assert_eq!(get_attribute(&link, "href").as_deref(), Some("url"));
        assert!(has_attribute(&link, "class"));
        assert!(!has_attribute(&link, "id"));
        assert_eq!(get_all_attributes(&link).len(), 2);
    }

    /// `br` and `img` are void; `p` and `div` are not.
    #[test]
    fn test_is_void_element() {
        let doc = parse("<div><br><img src='x'><p>text</p></div>");

        let cases = [("br", true), ("img", true), ("p", false), ("div", false)];
        for &(selector, expected) in &cases {
            assert_eq!(
                is_void_element(&doc.select(selector)),
                expected,
                "selector: {selector}"
            );
        }
    }

    /// Present classes are found, absent ones are not.
    #[test]
    fn test_has_class() {
        let doc = parse(r#"<div class="foo bar baz">Content</div>"#);
        let div = doc.select("div");

        for present in &["foo", "bar"] {
            assert!(has_class(&div, present), "expected class: {present}");
        }
        assert!(!has_class(&div, "qux"));
    }

    /// Both neighbours of the middle `<span>` are `<p>` elements.
    #[test]
    fn test_navigation() {
        let doc = parse("<div><p>1</p><span>2</span><p>3</p></div>");
        let middle = doc.select("span");

        let before = previous_element_sibling(&middle).expect("span has a preceding element");
        let after = next_element_sibling(&middle).expect("span has a following element");

        assert_eq!(tag_name(&before).as_deref(), Some("p"));
        assert_eq!(tag_name(&after).as_deref(), Some("p"));
    }

    /// Direct text keeps the element's own text and drops nested text.
    #[test]
    fn test_direct_text() {
        let doc = parse("<div>Direct text<span>Nested</span> more direct</div>");
        let own_text = direct_text(&doc.select("div"));

        for expected in &["Direct text", "more direct"] {
            assert!(own_text.contains(expected), "missing: {expected}");
        }
        assert!(!own_text.contains("Nested"));
    }

    /// Tag, class, id and compound selectors are all recognised.
    #[test]
    fn test_matches() {
        let doc = parse(r#"<div class="foo" id="bar">Content</div>"#);
        let div = doc.select("div");

        for hit in &["div", ".foo", "#bar", "div.foo"] {
            assert!(matches(&div, hit), "should match: {hit}");
        }
        for miss in &["span", ".baz"] {
            assert!(!matches(&div, miss), "should not match: {miss}");
        }
    }

    /// Adding a class keeps existing ones and never duplicates.
    #[test]
    fn test_add_class() {
        let doc = parse(r#"<div class="existing">Content</div>"#);
        let div = doc.select("div");

        add_class(&div, "new");
        assert!(has_class(&div, "existing"));
        assert!(has_class(&div, "new"));

        // A second add of the same class must leave a single occurrence.
        add_class(&div, "new");
        let classes = get_attribute(&div, "class").unwrap();
        assert_eq!(classes.matches("new").count(), 1);
    }

    /// Adding a class creates the attribute when it is missing.
    #[test]
    fn test_add_class_to_element_without_class() {
        let doc = parse("<div>Content</div>");
        let div = doc.select("div");

        add_class(&div, "new");
        assert!(has_class(&div, "new"));
    }

    /// Removing one class leaves the others in place.
    #[test]
    fn test_remove_class() {
        let doc = parse(r#"<div class="foo bar baz">Content</div>"#);
        let div = doc.select("div");

        remove_class(&div, "bar");

        assert!(!has_class(&div, "bar"));
        for kept in &["foo", "baz"] {
            assert!(has_class(&div, kept), "class should remain: {kept}");
        }
    }

    /// Removing the only class drops the `class` attribute entirely.
    #[test]
    fn test_remove_last_class() {
        let doc = parse(r#"<div class="only">Content</div>"#);
        let div = doc.select("div");

        remove_class(&div, "only");

        assert!(!has_class(&div, "only"));
        assert!(!has_attribute(&div, "class"));
    }
}