scrape_core/query/
extraction.rs1use super::{QueryResult, find_all, find_all_within, parse_selector};
4use crate::dom::{Document, NodeId, NodeKind};
5
6pub fn select_text(doc: &Document, selector: &str) -> QueryResult<Vec<String>> {
25 let selector_list = parse_selector(selector)?;
26 let node_ids = find_all(doc, selector)?;
27
28 Ok(node_ids.into_iter().map(|id| extract_text(doc, id)).collect())
29}
30
31pub fn select_text_within(
37 doc: &Document,
38 root: NodeId,
39 selector: &str,
40) -> QueryResult<Vec<String>> {
41 let selector_list = parse_selector(selector)?;
42 let node_ids = find_all_within(doc, root, selector)?;
43
44 Ok(node_ids.into_iter().map(|id| extract_text(doc, id)).collect())
45}
46
47pub fn select_attr(doc: &Document, selector: &str, attr: &str) -> QueryResult<Vec<Option<String>>> {
66 let selector_list = parse_selector(selector)?;
67 let node_ids = find_all(doc, selector)?;
68
69 Ok(node_ids.into_iter().map(|id| extract_attr(doc, id, attr)).collect())
70}
71
72pub fn select_attr_within(
78 doc: &Document,
79 root: NodeId,
80 selector: &str,
81 attr: &str,
82) -> QueryResult<Vec<Option<String>>> {
83 let selector_list = parse_selector(selector)?;
84 let node_ids = find_all_within(doc, root, selector)?;
85
86 Ok(node_ids.into_iter().map(|id| extract_attr(doc, id, attr)).collect())
87}
88
89fn extract_text(doc: &Document, root: NodeId) -> String {
91 let mut text = String::new();
92 collect_text(doc, root, &mut text);
93 text
94}
95
96fn collect_text(doc: &Document, node_id: NodeId, buffer: &mut String) {
98 let Some(node) = doc.get(node_id) else {
99 return;
100 };
101
102 match &node.kind {
103 NodeKind::Text { content } => {
104 buffer.push_str(content);
105 }
106 NodeKind::Element { .. } => {
107 for child_id in doc.children(node_id) {
108 collect_text(doc, child_id, buffer);
109 }
110 }
111 NodeKind::Comment { .. } => {}
112 }
113}
114
115fn extract_attr(doc: &Document, node_id: NodeId, attr: &str) -> Option<String> {
117 let node = doc.get(node_id)?;
118
119 if let NodeKind::Element { attributes, .. } = &node.kind {
120 attributes.get(attr).cloned()
121 } else {
122 None
123 }
124}
125
#[cfg(test)]
mod tests {
    //! Unit tests for the text and attribute extraction helpers, driven through
    //! documents built with `Soup::parse`.

    use super::*;
    use crate::Soup;

    // ---- select_text -------------------------------------------------------

    #[test]
    fn test_select_text_simple() {
        let soup = Soup::parse("<div><span>A</span><span>B</span></div>");
        let texts = select_text(soup.document(), "span").unwrap();
        assert_eq!(texts, vec!["A", "B"]);
    }

    #[test]
    fn test_select_text_nested() {
        // Text of nested children is concatenated into the parent's text.
        let soup = Soup::parse("<p>Hello <b>World</b>!</p>");
        let texts = select_text(soup.document(), "p").unwrap();
        assert_eq!(texts, vec!["Hello World!"]);
    }

    #[test]
    fn test_select_text_no_matches() {
        let soup = Soup::parse("<div>text</div>");
        let texts = select_text(soup.document(), "span").unwrap();
        assert!(texts.is_empty());
    }

    #[test]
    fn test_select_text_multiple_elements() {
        let soup = Soup::parse("<ul><li>First</li><li>Second</li><li>Third</li></ul>");
        let texts = select_text(soup.document(), "li").unwrap();
        assert_eq!(texts, vec!["First", "Second", "Third"]);
    }

    #[test]
    fn test_select_text_deeply_nested() {
        let soup = Soup::parse("<div><p><span>Deep</span></p></div>");
        let texts = select_text(soup.document(), "div").unwrap();
        assert_eq!(texts, vec!["Deep"]);
    }

    #[test]
    fn test_select_text_invalid_selector() {
        let soup = Soup::parse("<div>text</div>");
        let result = select_text(soup.document(), "[");
        assert!(result.is_err());
    }

    // ---- select_attr -------------------------------------------------------

    #[test]
    fn test_select_attr_simple() {
        let soup = Soup::parse("<a href='/a'>A</a><a href='/b'>B</a>");
        let hrefs = select_attr(soup.document(), "a", "href").unwrap();
        assert_eq!(hrefs, vec![Some("/a".to_string()), Some("/b".to_string())]);
    }

    #[test]
    fn test_select_attr_missing() {
        // A matched element without the attribute yields `None`, not a skipped entry.
        let soup = Soup::parse("<a href='/a'>A</a><a>B</a>");
        let hrefs = select_attr(soup.document(), "a", "href").unwrap();
        assert_eq!(hrefs, vec![Some("/a".to_string()), None]);
    }

    #[test]
    fn test_select_attr_no_matches() {
        let soup = Soup::parse("<div>text</div>");
        let hrefs = select_attr(soup.document(), "a", "href").unwrap();
        assert!(hrefs.is_empty());
    }

    #[test]
    fn test_select_attr_different_attributes() {
        let soup = Soup::parse(r#"<img src="/a.png" alt="A"><img src="/b.png" alt="B">"#);

        let srcs = select_attr(soup.document(), "img", "src").unwrap();
        assert_eq!(srcs, vec![Some("/a.png".to_string()), Some("/b.png".to_string())]);

        let alts = select_attr(soup.document(), "img", "alt").unwrap();
        assert_eq!(alts, vec![Some("A".to_string()), Some("B".to_string())]);
    }

    #[test]
    fn test_select_attr_invalid_selector() {
        let soup = Soup::parse("<a href='/a'>A</a>");
        let result = select_attr(soup.document(), "[", "href");
        assert!(result.is_err());
    }

    // ---- scoped variants ---------------------------------------------------

    #[test]
    fn test_select_text_within() {
        let soup = Soup::parse("<div><ul><li>A</li><li>B</li></ul><p>C</p></div>");
        // The search root is the <ul>, so the sibling <p> is outside the scope.
        let list = soup.find("ul").unwrap().unwrap();
        let texts = select_text_within(soup.document(), list.node_id(), "li").unwrap();
        assert_eq!(texts, vec!["A", "B"]);
    }

    #[test]
    fn test_select_attr_within() {
        let soup =
            Soup::parse(r#"<nav><a href="/1">1</a><a href="/2">2</a></nav><a href="/3">3</a>"#);
        let nav = soup.find("nav").unwrap().unwrap();
        let hrefs = select_attr_within(soup.document(), nav.node_id(), "a", "href").unwrap();
        // The link outside <nav> must not appear.
        assert_eq!(hrefs, vec![Some("/1".to_string()), Some("/2".to_string())]);
    }

    // ---- edge cases --------------------------------------------------------

    #[test]
    fn test_select_text_empty_element() {
        let soup = Soup::parse("<div></div>");
        let texts = select_text(soup.document(), "div").unwrap();
        assert_eq!(texts, vec![""]);
    }

    #[test]
    fn test_select_text_whitespace_preserved() {
        let soup = Soup::parse("<span> Hello </span>");
        let texts = select_text(soup.document(), "span").unwrap();
        assert_eq!(texts.len(), 1);
        // Only the word is pinned. The exact surrounding whitespace depends on how
        // the parser treats it, which this module does not control.
        assert_eq!(texts[0].trim(), "Hello");
    }
}