scrape_core/query/
extraction.rs

1//! Text and attribute extraction from query results.
2
3use super::{QueryResult, find_all, find_all_within, parse_selector};
4use crate::dom::{Document, NodeId, NodeKind};
5
6/// Extracts text content from all elements matching a CSS selector.
7///
8/// Returns the concatenated text content of each matching element.
9/// Empty vector if no elements match.
10///
11/// # Errors
12///
13/// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
14///
15/// # Examples
16///
17/// ```rust
18/// use scrape_core::{Soup, query::select_text};
19///
20/// let soup = Soup::parse("<div><span>A</span><span>B</span></div>");
21/// let texts = select_text(soup.document(), "span").unwrap();
22/// assert_eq!(texts, vec!["A", "B"]);
23/// ```
24pub fn select_text(doc: &Document, selector: &str) -> QueryResult<Vec<String>> {
25    let selector_list = parse_selector(selector)?;
26    let node_ids = find_all(doc, selector)?;
27
28    Ok(node_ids.into_iter().map(|id| extract_text(doc, id)).collect())
29}
30
31/// Extracts text content from elements within a subtree matching a CSS selector.
32///
33/// # Errors
34///
35/// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
36pub fn select_text_within(
37    doc: &Document,
38    root: NodeId,
39    selector: &str,
40) -> QueryResult<Vec<String>> {
41    let selector_list = parse_selector(selector)?;
42    let node_ids = find_all_within(doc, root, selector)?;
43
44    Ok(node_ids.into_iter().map(|id| extract_text(doc, id)).collect())
45}
46
47/// Extracts attribute values from all elements matching a CSS selector.
48///
49/// Returns `Some(value)` if the attribute exists, `None` if it doesn't.
50/// Empty vector if no elements match.
51///
52/// # Errors
53///
54/// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
55///
56/// # Examples
57///
58/// ```rust
59/// use scrape_core::{Soup, query::select_attr};
60///
61/// let soup = Soup::parse("<a href='/a'>A</a><a href='/b'>B</a>");
62/// let hrefs = select_attr(soup.document(), "a", "href").unwrap();
63/// assert_eq!(hrefs, vec![Some("/a".to_string()), Some("/b".to_string())]);
64/// ```
65pub fn select_attr(doc: &Document, selector: &str, attr: &str) -> QueryResult<Vec<Option<String>>> {
66    let selector_list = parse_selector(selector)?;
67    let node_ids = find_all(doc, selector)?;
68
69    Ok(node_ids.into_iter().map(|id| extract_attr(doc, id, attr)).collect())
70}
71
72/// Extracts attribute values from elements within a subtree matching a CSS selector.
73///
74/// # Errors
75///
76/// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
77pub fn select_attr_within(
78    doc: &Document,
79    root: NodeId,
80    selector: &str,
81    attr: &str,
82) -> QueryResult<Vec<Option<String>>> {
83    let selector_list = parse_selector(selector)?;
84    let node_ids = find_all_within(doc, root, selector)?;
85
86    Ok(node_ids.into_iter().map(|id| extract_attr(doc, id, attr)).collect())
87}
88
89/// Extracts all text content from an element and its descendants.
90fn extract_text(doc: &Document, root: NodeId) -> String {
91    let mut text = String::new();
92    collect_text(doc, root, &mut text);
93    text
94}
95
96/// Recursively collects text from a node and its descendants.
97fn collect_text(doc: &Document, node_id: NodeId, buffer: &mut String) {
98    let Some(node) = doc.get(node_id) else {
99        return;
100    };
101
102    match &node.kind {
103        NodeKind::Text { content } => {
104            buffer.push_str(content);
105        }
106        NodeKind::Element { .. } => {
107            for child_id in doc.children(node_id) {
108                collect_text(doc, child_id, buffer);
109            }
110        }
111        NodeKind::Comment { .. } => {}
112    }
113}
114
115/// Extracts an attribute value from an element.
116fn extract_attr(doc: &Document, node_id: NodeId, attr: &str) -> Option<String> {
117    let node = doc.get(node_id)?;
118
119    if let NodeKind::Element { attributes, .. } = &node.kind {
120        attributes.get(attr).cloned()
121    } else {
122        None
123    }
124}
125
/// Unit tests for the text and attribute extraction helpers.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Soup;

    // --- select_text ---

    #[test]
    fn test_select_text_simple() {
        let soup = Soup::parse("<div><span>A</span><span>B</span></div>");
        let texts = select_text(soup.document(), "span").unwrap();
        assert_eq!(texts, vec!["A", "B"]);
    }

    #[test]
    fn test_select_text_nested() {
        // Text of descendants is concatenated in document order.
        let soup = Soup::parse("<p>Hello <b>World</b>!</p>");
        let texts = select_text(soup.document(), "p").unwrap();
        assert_eq!(texts, vec!["Hello World!"]);
    }

    #[test]
    fn test_select_text_no_matches() {
        let soup = Soup::parse("<div>text</div>");
        let texts = select_text(soup.document(), "span").unwrap();
        assert!(texts.is_empty());
    }

    #[test]
    fn test_select_text_multiple_elements() {
        let soup = Soup::parse("<ul><li>First</li><li>Second</li><li>Third</li></ul>");
        let texts = select_text(soup.document(), "li").unwrap();
        assert_eq!(texts, vec!["First", "Second", "Third"]);
    }

    #[test]
    fn test_select_text_deeply_nested() {
        let soup = Soup::parse("<div><p><span>Deep</span></p></div>");
        let texts = select_text(soup.document(), "div").unwrap();
        assert_eq!(texts, vec!["Deep"]);
    }

    #[test]
    fn test_select_text_invalid_selector() {
        let soup = Soup::parse("<div>text</div>");
        let result = select_text(soup.document(), "[");
        assert!(result.is_err());
    }

    // --- select_attr ---

    #[test]
    fn test_select_attr_simple() {
        let soup = Soup::parse("<a href='/a'>A</a><a href='/b'>B</a>");
        let hrefs = select_attr(soup.document(), "a", "href").unwrap();
        assert_eq!(hrefs, vec![Some("/a".to_string()), Some("/b".to_string())]);
    }

    #[test]
    fn test_select_attr_missing() {
        // A matching element without the attribute still produces an entry (`None`).
        let soup = Soup::parse("<a href='/a'>A</a><a>B</a>");
        let hrefs = select_attr(soup.document(), "a", "href").unwrap();
        assert_eq!(hrefs, vec![Some("/a".to_string()), None]);
    }

    #[test]
    fn test_select_attr_no_matches() {
        let soup = Soup::parse("<div>text</div>");
        let hrefs = select_attr(soup.document(), "a", "href").unwrap();
        assert!(hrefs.is_empty());
    }

    #[test]
    fn test_select_attr_different_attributes() {
        let soup = Soup::parse(r#"<img src="/a.png" alt="A"><img src="/b.png" alt="B">"#);

        let srcs = select_attr(soup.document(), "img", "src").unwrap();
        assert_eq!(srcs, vec![Some("/a.png".to_string()), Some("/b.png".to_string())]);

        let alts = select_attr(soup.document(), "img", "alt").unwrap();
        assert_eq!(alts, vec![Some("A".to_string()), Some("B".to_string())]);
    }

    #[test]
    fn test_select_attr_invalid_selector() {
        let soup = Soup::parse("<a href='/a'>A</a>");
        let result = select_attr(soup.document(), "[", "href");
        assert!(result.is_err());
    }

    // --- subtree-scoped variants ---

    #[test]
    fn test_select_text_within() {
        let soup = Soup::parse("<div><ul><li>A</li><li>B</li></ul><p>C</p></div>");
        // The search root is the <ul>, so the sibling <p> is outside the subtree.
        let list = soup.find("ul").unwrap().unwrap();
        let texts = select_text_within(soup.document(), list.node_id(), "li").unwrap();
        assert_eq!(texts, vec!["A", "B"]);
    }

    #[test]
    fn test_select_attr_within() {
        let soup =
            Soup::parse(r#"<nav><a href="/1">1</a><a href="/2">2</a></nav><a href="/3">3</a>"#);
        // The third link sits outside <nav> and must not be reported.
        let nav = soup.find("nav").unwrap().unwrap();
        let hrefs = select_attr_within(soup.document(), nav.node_id(), "a", "href").unwrap();
        assert_eq!(hrefs, vec![Some("/1".to_string()), Some("/2".to_string())]);
    }

    // --- edge cases ---

    #[test]
    fn test_select_text_empty_element() {
        // An element with no text still yields an entry: the empty string.
        let soup = Soup::parse("<div></div>");
        let texts = select_text(soup.document(), "div").unwrap();
        assert_eq!(texts, vec![""]);
    }

    #[test]
    fn test_select_text_whitespace_preserved() {
        let soup = Soup::parse("<span>  Hello  </span>");
        let texts = select_text(soup.document(), "span").unwrap();
        // Whether the surrounding whitespace survives depends on the parser, so
        // compare the trimmed text: this pins the match count and the content
        // without assuming a particular whitespace policy.
        assert_eq!(texts.len(), 1);
        assert_eq!(texts[0].trim(), "Hello");
    }
}
scrape_core/query/extraction.rs

scrape_core/query/
extraction.rs