scrape_core/query/
text.rs1use crate::dom::{Document, NodeId, NodeKind};
4
5pub struct TextNodesIter<'a> {
22 doc: &'a Document,
23 stack: Vec<NodeId>,
24}
25
26impl<'a> TextNodesIter<'a> {
27 #[must_use]
29 pub fn new(doc: &'a Document, root: NodeId) -> Self {
30 Self { doc, stack: vec![root] }
31 }
32}
33
34impl<'a> Iterator for TextNodesIter<'a> {
35 type Item = &'a str;
36
37 fn next(&mut self) -> Option<Self::Item> {
38 while let Some(id) = self.stack.pop() {
39 let Some(node) = self.doc.get(id) else {
40 continue;
41 };
42
43 match &node.kind {
44 NodeKind::Text { content } => {
45 return Some(content.as_str());
46 }
47 NodeKind::Element { .. } => {
48 #[allow(clippy::needless_collect)]
51 let children: Vec<_> = self.doc.children(id).collect();
52 for child_id in children.into_iter().rev() {
53 self.stack.push(child_id);
54 }
55 }
56 NodeKind::Comment { .. } => {}
57 }
58 }
59 None
60 }
61}
62
#[cfg(test)]
mod tests {
    use crate::{Soup, SoupConfig};

    /// Collects the text nodes under the first `<div>` of `soup`.
    ///
    /// The slices are copied into owned strings so the result does not
    /// borrow from the temporary tag handle.
    fn div_texts(soup: &Soup) -> Vec<String> {
        let div = soup.find("div").unwrap().unwrap();
        div.text_nodes().map(str::to_owned).collect()
    }

    #[test]
    fn test_text_nodes_single_text() {
        let soup = Soup::parse("<div>Hello</div>");
        assert_eq!(div_texts(&soup), vec!["Hello"]);
    }

    #[test]
    fn test_text_nodes_nested() {
        let soup = Soup::parse("<div>Hello <b>World</b>!</div>");
        assert_eq!(div_texts(&soup), vec!["Hello ", "World", "!"]);
    }

    #[test]
    fn test_text_nodes_empty_element() {
        let soup = Soup::parse("<div></div>");
        assert!(div_texts(&soup).is_empty());
    }

    #[test]
    fn test_text_nodes_skips_comments() {
        let config = SoupConfig::builder().include_comments(true).build();
        let soup = Soup::parse_with_config("<div>A<!--comment-->B</div>", config);
        assert_eq!(div_texts(&soup), vec!["A", "B"]);
    }

    #[test]
    fn test_text_nodes_deeply_nested() {
        let soup = Soup::parse("<div><p><span>A</span></p><p><span>B</span></p></div>");
        assert_eq!(div_texts(&soup), vec!["A", "B"]);
    }

    #[test]
    fn test_text_nodes_depth_first_order() {
        let soup = Soup::parse("<div>1<span>2<b>3</b>4</span>5</div>");
        assert_eq!(div_texts(&soup), vec!["1", "2", "3", "4", "5"]);
    }

    #[test]
    fn test_text_nodes_whitespace() {
        let config = SoupConfig::builder().preserve_whitespace(true).build();
        let soup = Soup::parse_with_config("<div> A </div>", config);
        let texts = div_texts(&soup);
        assert_eq!(texts.len(), 1);
        assert!(texts[0].contains(" A "));
    }
}