Skip to main content

scrape_core/query/
text.rs

1//! Iterator over text nodes in a DOM subtree.
2
3use crate::dom::{Document, NodeId, NodeKind};
4
5/// Iterator over text content within an element subtree.
6///
7/// Returns only text node content, skipping element tags and comments.
8/// Iterates in depth-first order.
9///
10/// # Examples
11///
12/// ```rust
13/// use scrape_core::Soup;
14///
15/// let soup = Soup::parse("<div>Hello <b>World</b>!</div>");
16/// if let Ok(Some(div)) = soup.find("div") {
17///     let texts: Vec<_> = div.text_nodes().collect();
18///     assert_eq!(texts, vec!["Hello ", "World", "!"]);
19/// }
20/// ```
21pub struct TextNodesIter<'a> {
22    doc: &'a Document,
23    stack: Vec<NodeId>,
24}
25
26impl<'a> TextNodesIter<'a> {
27    /// Creates a new text nodes iterator rooted at the given node.
28    #[must_use]
29    pub fn new(doc: &'a Document, root: NodeId) -> Self {
30        Self { doc, stack: vec![root] }
31    }
32}
33
34impl<'a> Iterator for TextNodesIter<'a> {
35    type Item = &'a str;
36
37    fn next(&mut self) -> Option<Self::Item> {
38        while let Some(id) = self.stack.pop() {
39            let Some(node) = self.doc.get(id) else {
40                continue;
41            };
42
43            match &node.kind {
44                NodeKind::Text { content } => {
45                    return Some(content.as_str());
46                }
47                NodeKind::Element { .. } => {
48                    // Push children in reverse order for depth-first traversal
49                    // Collect required: ChildrenIter does not implement DoubleEndedIterator
50                    #[allow(clippy::needless_collect)]
51                    let children: Vec<_> = self.doc.children(id).collect();
52                    for child_id in children.into_iter().rev() {
53                        self.stack.push(child_id);
54                    }
55                }
56                NodeKind::Comment { .. } => {}
57            }
58        }
59        None
60    }
61}
62
#[cfg(test)]
mod tests {
    use crate::{Soup, SoupConfig};

    /// A lone text child is the only item yielded.
    #[test]
    fn test_text_nodes_single_text() {
        let parsed = Soup::parse("<div>Hello</div>");
        let root = parsed.find("div").unwrap().unwrap();
        let collected = root.text_nodes().collect::<Vec<_>>();
        assert_eq!(collected, ["Hello"]);
    }

    /// Text inside a nested element is interleaved with its siblings.
    #[test]
    fn test_text_nodes_nested() {
        let parsed = Soup::parse("<div>Hello <b>World</b>!</div>");
        let root = parsed.find("div").unwrap().unwrap();
        let collected = root.text_nodes().collect::<Vec<_>>();
        assert_eq!(collected, ["Hello ", "World", "!"]);
    }

    /// An element without children yields nothing.
    #[test]
    fn test_text_nodes_empty_element() {
        let parsed = Soup::parse("<div></div>");
        let root = parsed.find("div").unwrap().unwrap();
        assert_eq!(root.text_nodes().count(), 0);
    }

    /// Comments kept by the parser are not reported as text.
    #[test]
    fn test_text_nodes_skips_comments() {
        let keep_comments = SoupConfig::builder().include_comments(true).build();
        let parsed = Soup::parse_with_config("<div>A<!--comment-->B</div>", keep_comments);
        let root = parsed.find("div").unwrap().unwrap();
        let collected = root.text_nodes().collect::<Vec<_>>();
        assert_eq!(collected, ["A", "B"]);
    }

    /// Text several levels down is still found, in sibling order.
    #[test]
    fn test_text_nodes_deeply_nested() {
        let parsed = Soup::parse("<div><p><span>A</span></p><p><span>B</span></p></div>");
        let root = parsed.find("div").unwrap().unwrap();
        let collected = root.text_nodes().collect::<Vec<_>>();
        assert_eq!(collected, ["A", "B"]);
    }

    /// Traversal descends into an element before moving to its next sibling.
    #[test]
    fn test_text_nodes_depth_first_order() {
        let parsed = Soup::parse("<div>1<span>2<b>3</b>4</span>5</div>");
        let root = parsed.find("div").unwrap().unwrap();
        let collected = root.text_nodes().collect::<Vec<_>>();
        assert_eq!(collected, ["1", "2", "3", "4", "5"]);
    }

    /// With whitespace preservation on, surrounding spaces stay in the text.
    #[test]
    fn test_text_nodes_whitespace() {
        let keep_whitespace = SoupConfig::builder().preserve_whitespace(true).build();
        let parsed = Soup::parse_with_config("<div>  A  </div>", keep_whitespace);
        let root = parsed.find("div").unwrap().unwrap();
        let collected = root.text_nodes().collect::<Vec<_>>();
        match collected.as_slice() {
            [only] => assert!(only.contains("  A  ")),
            other => panic!("expected exactly one text node, got {other:?}"),
        }
    }
}