Skip to main content

html_cleaning/
tree.rs

1//! Tree manipulation with lxml-style text/tail model.
2//!
3//! This module provides functions for working with the text/tail model
4//! used in lxml-style HTML processing.
5//!
6//! ## Text vs Tail
7//!
8//! In this model, elements have:
9//! - **Text**: Text content BEFORE the first child element
10//! - **Tail**: Text content AFTER the element's closing tag
11//!
12//! ```html
13//! <div>
14//!   TEXT HERE          <!-- div's "text" -->
15//!   <span>inner</span>
16//!   TAIL HERE          <!-- span's "tail" -->
17//! </div>
18//! ```
19
20use dom_query::Selection;
21pub use dom_query::Document;
22
23/// Get text before first child element.
24///
25/// Returns text nodes that appear before any child element.
26#[must_use]
27pub fn text(sel: &Selection) -> String {
28    let mut result = String::new();
29
30    if let Some(node) = sel.nodes().first() {
31        for child in node.children() {
32            if child.is_element() {
33                break; // Stop at first element
34            }
35            if child.is_text() {
36                let text_content = child.text();
37                result.push_str(&text_content);
38            }
39        }
40    }
41
42    result
43}
44
45/// Get text after element's closing tag (tail).
46///
47/// Returns text nodes that follow this element until the next sibling element.
48#[must_use]
49pub fn tail(sel: &Selection) -> String {
50    let mut result = String::new();
51
52    if let Some(node) = sel.nodes().first() {
53        let mut next = node.next_sibling();
54
55        while let Some(sibling) = next {
56            if sibling.is_element() {
57                break; // Stop at next element
58            }
59            if sibling.is_text() {
60                let text_content = sibling.text();
61                result.push_str(&text_content);
62            }
63            next = sibling.next_sibling();
64        }
65    }
66
67    result
68}
69
70/// Set text before first child element.
71///
72/// Removes existing pre-element text and inserts new text.
73pub fn set_text(sel: &Selection, new_text: &str) {
74    // Remove existing text nodes before first element
75    if let Some(node) = sel.nodes().first() {
76        let mut to_remove = Vec::new();
77
78        for child in node.children() {
79            if child.is_element() {
80                break;
81            }
82            if child.is_text() {
83                to_remove.push(child);
84            }
85        }
86
87        for text_node in to_remove {
88            Selection::from(text_node).remove();
89        }
90    }
91
92    // Prepend new text
93    if !new_text.is_empty() {
94        let escaped = escape_html(new_text);
95        sel.prepend_html(escaped.as_str());
96    }
97}
98
99/// Set tail text after element.
100///
101/// Removes existing tail text nodes and inserts new text after element.
102pub fn set_tail(sel: &Selection, new_tail: &str) {
103    // Remove existing tail nodes using helper
104    for tail_node in tail_nodes(sel) {
105        Selection::from(tail_node).remove();
106    }
107
108    // Insert new tail text after element
109    if !new_tail.is_empty() {
110        let escaped = escape_html(new_tail);
111        sel.after_html(escaped.as_str());
112    }
113}
114
115/// Get all tail text nodes for an element.
116///
117/// Returns a vector of text nodes that follow this element.
118#[must_use]
119pub fn tail_nodes<'a>(sel: &Selection<'a>) -> Vec<dom_query::NodeRef<'a>> {
120    let mut nodes = Vec::new();
121
122    if let Some(node) = sel.nodes().first() {
123        let mut next = node.next_sibling();
124
125        while let Some(sibling) = next {
126            if sibling.is_element() {
127                break;
128            }
129            if sibling.is_text() {
130                nodes.push(sibling);
131            }
132            next = sibling.next_sibling();
133        }
134    }
135
136    nodes
137}
138
/// Check if tag is a void element (self-closing).
///
/// Void elements like `<br>`, `<hr>`, `<img>` cannot have children.
///
/// The comparison is ASCII case-insensitive, so `"BR"` and `"br"` both
/// match. Only ASCII letters are case-folded: a name containing a
/// non-ASCII look-alike character never matches. No allocation is performed.
#[must_use]
pub fn is_void_element(tag: &str) -> bool {
    const VOID_ELEMENTS: [&str; 14] = [
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];

    VOID_ELEMENTS
        .iter()
        .any(|name| tag.eq_ignore_ascii_case(name))
}
150
/// Escape the five HTML-significant characters in `text`.
///
/// `&`, `<`, `>`, `"` and `'` are replaced by `&amp;`, `&lt;`, `&gt;`,
/// `&quot;` and `&#39;` respectively; every other character is copied
/// through unchanged.
fn escape_html(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());

    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }

    escaped
}
159
160/// Get all text content with separator at level changes.
161#[must_use]
162pub fn iter_text(sel: &Selection, separator: &str) -> String {
163    let mut result = String::new();
164    let mut last_level = 0;
165
166    if let Some(node) = sel.nodes().first() {
167        traverse_for_text(node, 0, &mut last_level, separator, &mut result);
168    }
169
170    result.trim().to_string()
171}
172
173fn traverse_for_text(
174    node: &dom_query::NodeRef,
175    level: usize,
176    last_level: &mut usize,
177    sep: &str,
178    result: &mut String,
179) {
180    if node.is_text() {
181        if level != *last_level && !result.is_empty() {
182            result.push_str(sep);
183        }
184        let text_content = node.text();
185        result.push_str(&text_content);
186    } else if node.is_element() {
187        // Check if void element - add separator
188        if let Some(tag) = node.node_name() {
189            if is_void_element(&tag) && !result.is_empty() {
190                result.push_str(sep);
191            }
192        }
193    }
194    *last_level = level;
195
196    for child in node.children() {
197        traverse_for_text(&child, level + 1, last_level, sep, result);
198    }
199}
200
201/// Create a new element as a Document.
202///
203/// Table elements (tr, th, td, tbody, thead, tfoot) are wrapped in proper
204/// table context for correct HTML parsing.
205#[must_use]
206pub fn element(tag: &str) -> Document {
207    // Table elements need to be wrapped in proper context for parsing
208    match tag.to_lowercase().as_str() {
209        "tr" | "th" | "td" | "tbody" | "thead" | "tfoot" => {
210            Document::from(format!("<table><{tag}></{tag}></table>"))
211        }
212        _ => Document::from(format!("<{tag}></{tag}>")),
213    }
214}
215
216/// Create child element and append to parent.
217#[must_use]
218pub fn sub_element<'a>(parent: &Selection<'a>, tag: &str) -> Selection<'a> {
219    let html = format!("<{tag}></{tag}>");
220    parent.append_html(html.as_str());
221    parent.children().last()
222}
223
224/// Remove element from tree.
225///
226/// # Arguments
227/// * `sel` - Element to remove
228/// * `keep_tail` - If true, preserve tail text
229pub fn remove(sel: &Selection, keep_tail: bool) {
230    if !keep_tail {
231        // Also remove tail text nodes
232        if let Some(node) = sel.nodes().first() {
233            let mut next = node.next_sibling();
234            let mut to_remove = Vec::new();
235
236            while let Some(sibling) = next {
237                if sibling.is_element() {
238                    break;
239                }
240                if sibling.is_text() {
241                    to_remove.push(sibling);
242                }
243                next = sibling.next_sibling();
244            }
245
246            for text_node in to_remove {
247                Selection::from(text_node).remove();
248            }
249        }
250    }
251    sel.remove();
252}
253
254/// Strip element but keep children.
255///
256/// Moves children to parent, then removes the element.
257pub fn strip(sel: &Selection) {
258    if let Some(node) = sel.nodes().first() {
259        // Move first child (and all its siblings) before this node
260        if let Some(first_child) = node.first_child() {
261            node.insert_siblings_before(&first_child);
262        }
263        // Remove the now-empty element
264        node.remove_from_parent();
265    }
266}
267
/// Check if tag is safe to use as a CSS selector.
///
/// Prevents CSS injection by ensuring tag contains only valid characters:
/// the first character must be an ASCII letter and the rest must be ASCII
/// alphanumerics, `-` or `_`.
///
/// The leading-letter requirement matters because names such as `"1abc"` or
/// `"-"` are not valid CSS type selectors and would make selector parsing
/// fail. Names rejected here are handled by the callers' manual filtering
/// fallback instead.
fn is_safe_tag_selector(tag: &str) -> bool {
    let mut chars = tag.chars();

    // An empty string has no first character and is rejected here.
    chars.next().is_some_and(|c| c.is_ascii_alphabetic())
        && chars.all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
}
277
278/// Remove all elements with given tags.
279pub fn strip_elements(tree: &Selection, keep_tail: bool, tags: &[&str]) {
280    for tag in tags {
281        let nodes: Vec<_> = if is_safe_tag_selector(tag) {
282            tree.select(tag).nodes().to_vec()
283        } else {
284            // Fallback to manual filtering for unsafe selectors
285            let target = tag.to_ascii_lowercase();
286            tree.select("*")
287                .nodes()
288                .iter()
289                .copied()
290                .filter(|n| {
291                    n.node_name()
292                        .is_some_and(|name| name.to_ascii_lowercase() == target)
293                })
294                .collect()
295        };
296        for node in nodes.into_iter().rev() {
297            let sel = Selection::from(node);
298            remove(&sel, keep_tail);
299        }
300    }
301}
302
303/// Iterate elements matching tags.
304#[must_use]
305pub fn iter<'a>(sel: &Selection<'a>, tags: &[&str]) -> Selection<'a> {
306    if tags.is_empty() {
307        sel.select("*")
308    } else {
309        sel.select(&tags.join(","))
310    }
311}
312
313/// Like `iter` but excludes the element itself.
314#[must_use]
315pub fn iter_descendants<'a>(sel: &Selection<'a>, tags: &[&str]) -> Selection<'a> {
316    // select() already excludes self, so same as iter
317    iter(sel, tags)
318}
319
320/// Strip tags from selection, keeping their content.
321///
322/// Similar to `strip_elements` but uses `strip()` instead of `remove()`.
323pub fn strip_tags(tree: &Selection, tags: &[&str]) {
324    for tag in tags {
325        let nodes: Vec<_> = if is_safe_tag_selector(tag) {
326            tree.select(tag).nodes().to_vec()
327        } else {
328            // Fallback to manual filtering for unsafe selectors
329            let target = tag.to_ascii_lowercase();
330            tree.select("*")
331                .nodes()
332                .iter()
333                .copied()
334                .filter(|n| {
335                    n.node_name()
336                        .is_some_and(|name| name.to_ascii_lowercase() == target)
337                })
338                .collect()
339        };
340        for node in nodes.into_iter().rev() {
341            let sel = Selection::from(node);
342            strip(&sel);
343        }
344    }
345}
346
/// Append child element.
///
/// Thin wrapper that forwards to `parent.append_selection(child)`.
///
/// # Arguments
/// * `parent` - Selection that receives the child
/// * `child` - Selection to append
pub fn append(parent: &Selection, child: &Selection) {
    parent.append_selection(child);
}
351
352/// Append multiple children.
353pub fn extend(parent: &Selection, children: &[&Selection]) {
354    for child in children {
355        append(parent, child);
356    }
357}
358
// Unit tests for the text/tail helpers and the tree-manipulation functions.
// Every test parses a small HTML snippet with `Document::from` and checks
// the outcome through selections on that document.
#[cfg(test)]
mod tests {
    use super::*;

    // --- text() ---

    #[test]
    fn test_text_before_children() {
        let doc = Document::from("<div>Hello <span>World</span></div>");
        let div = doc.select("div");
        assert_eq!(text(&div), "Hello ");
    }

    #[test]
    fn test_text_no_children() {
        let doc = Document::from("<p>Just text</p>");
        let p = doc.select("p");
        assert_eq!(text(&p), "Just text");
    }

    #[test]
    fn test_text_empty() {
        let doc = Document::from("<div><span>only child</span></div>");
        let div = doc.select("div");
        assert_eq!(text(&div), "");
    }

    // --- tail() / tail_nodes() ---

    #[test]
    fn test_tail_after_element() {
        let doc = Document::from("<div><span>inner</span> tail text</div>");
        let span = doc.select("span");
        assert_eq!(tail(&span), " tail text");
    }

    #[test]
    fn test_tail_no_tail() {
        let doc = Document::from("<div><span>inner</span></div>");
        let span = doc.select("span");
        assert_eq!(tail(&span), "");
    }

    #[test]
    fn test_tail_stops_at_next_element() {
        let doc = Document::from("<div><span>1</span> tail <span>2</span></div>");
        let first_span = doc.select("span").first();
        assert_eq!(tail(&first_span), " tail ");
    }

    #[test]
    fn test_tail_nodes() {
        let doc = Document::from("<div><span>1</span> text1 text2 <span>2</span></div>");
        let first_span = doc.select("span").first();
        let nodes = tail_nodes(&first_span);
        assert!(!nodes.is_empty());
    }

    // --- set_text() / set_tail() ---

    #[test]
    fn test_set_text() {
        let doc = Document::from("<div>Old text<span>child</span></div>");
        let div = doc.select("div");
        set_text(&div, "New text");
        assert_eq!(text(&div), "New text");
        // The child element must survive the text replacement.
        assert!(doc.select("span").exists());
    }

    #[test]
    fn test_set_tail() {
        let doc = Document::from("<div><span>inner</span>Old tail</div>");
        let span = doc.select("span");
        set_tail(&span, "New tail");
        assert_eq!(tail(&span), "New tail");
    }

    // --- element() / sub_element() ---

    #[test]
    fn test_element_creation() {
        let doc = element("p");
        assert!(doc.select("p").exists());
    }

    #[test]
    fn test_sub_element() {
        let doc = Document::from("<div></div>");
        let div = doc.select("div");
        let _span = sub_element(&div, "span");
        assert!(doc.select("div > span").exists());
    }

    // --- remove() ---

    #[test]
    fn test_remove_with_tail() {
        let doc = Document::from("<div>text <span>remove</span> keep this</div>");
        let span = doc.select("span");
        remove(&span, true); // Keep tail
        assert!(doc.select("span").is_empty());
        assert!(doc.select("div").text().contains("keep this"));
    }

    #[test]
    fn test_remove_without_tail() {
        let doc = Document::from("<div>text <span>remove</span> remove this too</div>");
        let span = doc.select("span");
        remove(&span, false); // Remove tail too
        let div_text = doc.select("div").text().to_string();
        assert!(!div_text.contains("remove this"));
    }

    // --- strip() ---

    #[test]
    fn test_strip() {
        let doc = Document::from("<div><p><span>inner</span> text</p></div>");
        let p = doc.select("p");
        strip(&p);
        assert!(doc.select("p").is_empty());
        assert!(doc.select("span").exists());
    }

    #[test]
    fn test_strip_preserves_children() {
        let doc = Document::from("<div><p><span>inner</span> text</p></div>");
        let p = doc.select("p");
        strip(&p);
        assert!(doc.select("p").is_empty());
        assert_eq!(doc.select("span").length(), 1);
        assert!(doc.select("div").text().contains("inner"));
    }

    #[test]
    fn test_strip_empty_element() {
        let doc = Document::from("<div><p></p><span>kept</span></div>");
        let p = doc.select("p");
        strip(&p);
        assert!(doc.select("p").is_empty());
        assert_eq!(doc.select("span").length(), 1);
    }

    // --- strip_elements() / strip_tags() ---

    #[test]
    fn test_strip_elements_keep_tail() {
        let doc = Document::from("<div><b>bold</b> tail<i>italic</i> more</div>");
        let div = doc.select("div");
        strip_elements(&div, true, &["b", "i"]);
        assert!(doc.select("b").is_empty());
        assert!(doc.select("i").is_empty());
        let text_result = div.text().to_string();
        assert!(text_result.contains("tail"));
        assert!(text_result.contains("more"));
    }

    #[test]
    fn test_strip_elements_remove_tail() {
        let doc = Document::from("<div><b>bold</b> tail<i>italic</i> more</div>");
        let div = doc.select("div");
        strip_elements(&div, false, &["b", "i"]);
        assert!(doc.select("b").is_empty());
        assert!(doc.select("i").is_empty());
        let text_result = div.text().to_string();
        assert!(!text_result.contains("tail"));
        assert!(!text_result.contains("more"));
    }

    #[test]
    fn test_strip_tags() {
        let doc = Document::from("<div><b>bold</b> text <i>italic</i></div>");
        let div = doc.select("div");
        strip_tags(&div, &["b", "i"]);
        assert!(doc.select("b").is_empty());
        assert!(doc.select("i").is_empty());
        // Content should be preserved
        let text_result = div.text().to_string();
        assert!(text_result.contains("bold"));
        assert!(text_result.contains("italic"));
    }

    // --- iter() / iter_text() ---

    #[test]
    fn test_iter_all_elements() {
        let doc = Document::from("<div><p>1</p><span>2</span><p>3</p></div>");
        let div = doc.select("div");
        let all = iter(&div, &[]);
        assert_eq!(all.length(), 3); // p, span, p
    }

    #[test]
    fn test_iter_filtered() {
        let doc = Document::from("<div><p>1</p><span>2</span><p>3</p></div>");
        let div = doc.select("div");
        let only_p = iter(&div, &["p"]);
        assert_eq!(only_p.length(), 2); // Both p tags
    }

    #[test]
    fn test_iter_text_with_separator() {
        let doc = Document::from("<p>Hello<span>World</span>!</p>");
        let p = doc.select("p");
        let result = iter_text(&p, " ");
        // A separator is inserted each time the nesting level changes.
        assert_eq!(result, "Hello World !");
    }

    // --- is_void_element() ---

    #[test]
    fn test_is_void_element() {
        assert!(is_void_element("br"));
        assert!(is_void_element("BR"));
        assert!(is_void_element("img"));
        assert!(is_void_element("hr"));
        assert!(!is_void_element("div"));
        assert!(!is_void_element("span"));
    }

    // --- extend() ---

    #[test]
    fn test_extend() {
        let doc = Document::from("<div></div>");
        let div = doc.select("div");

        // The children come from separate documents.
        let doc1 = Document::from("<span>1</span>");
        let child1 = doc1.select("span");
        let doc2 = Document::from("<span>2</span>");
        let child2 = doc2.select("span");

        extend(&div, &[&child1, &child2]);

        assert_eq!(doc.select("div > span").length(), 2);
    }
}