scrape_core/query/
find.rs

1//! Find functions for locating elements in the DOM.
2//!
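//! This module provides functions to find elements using CSS selectors:
//!
//! - [`find`] - Find first matching element in the entire document
//! - [`find_all`] - Find all matching elements in the entire document
//! - [`find_within`] - Find first matching element within a subtree
//! - [`find_all_within`] - Find all matching elements within a subtree
//!
//! Each of these also has a `*_with_selector` variant (for example
//! [`find_with_selector`]) that takes a pre-parsed selector list, which avoids
//! re-parsing when the same selector is used for repeated queries.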
9
10use selectors::{context::SelectorCaches, parser::SelectorList};
11
12use super::{
13    error::QueryResult,
14    selector::{ScrapeSelector, matches_selector_with_caches, parse_selector},
15};
16use crate::dom::{Document, NodeId};
17
18/// Finds the first element matching a CSS selector.
19///
20/// # Errors
21///
22/// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
23///
24/// # Examples
25///
26/// ```rust
27/// use scrape_core::{Html5everParser, Parser, query::find};
28///
29/// let parser = Html5everParser;
30/// let doc = parser.parse("<div><span class=\"item\">text</span></div>").unwrap();
31///
32/// let result = find(&doc, "span.item").unwrap();
33/// assert!(result.is_some());
34/// ```
35pub fn find(doc: &Document, selector: &str) -> QueryResult<Option<NodeId>> {
36    let selectors = parse_selector(selector)?;
37    Ok(find_with_selector(doc, &selectors))
38}
39
40/// Finds all elements matching a CSS selector.
41///
42/// # Errors
43///
44/// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
45///
46/// # Examples
47///
48/// ```rust
49/// use scrape_core::{Html5everParser, Parser, query::find_all};
50///
51/// let parser = Html5everParser;
52/// let doc = parser.parse("<ul><li>A</li><li>B</li><li>C</li></ul>").unwrap();
53///
54/// let items = find_all(&doc, "li").unwrap();
55/// assert_eq!(items.len(), 3);
56/// ```
57pub fn find_all(doc: &Document, selector: &str) -> QueryResult<Vec<NodeId>> {
58    let selectors = parse_selector(selector)?;
59    Ok(find_all_with_selector(doc, &selectors))
60}
61
62/// Finds the first element matching a CSS selector within a subtree.
63///
64/// The search starts from the given scope node and only includes its descendants.
65///
66/// # Errors
67///
68/// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
69///
70/// # Examples
71///
72/// ```rust
73/// use scrape_core::{Html5everParser, Parser, query::find_within};
74///
75/// let parser = Html5everParser;
76/// let doc = parser
77///     .parse("<div id=\"a\"><span>A</span></div><div id=\"b\"><span>B</span></div>")
78///     .unwrap();
79///
80/// // Find div#a first
81/// let scope = doc
82///     .nodes()
83///     .find(|(_, n)| n.kind.attributes().and_then(|a| a.get("id")) == Some(&"a".to_string()))
84///     .map(|(id, _)| id)
85///     .unwrap();
86///
87/// let result = find_within(&doc, scope, "span").unwrap();
88/// assert!(result.is_some());
89/// ```
90pub fn find_within(doc: &Document, scope: NodeId, selector: &str) -> QueryResult<Option<NodeId>> {
91    let selectors = parse_selector(selector)?;
92    Ok(find_within_with_selector(doc, scope, &selectors))
93}
94
95/// Finds all elements matching a CSS selector within a subtree.
96///
97/// # Errors
98///
99/// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
100pub fn find_all_within(doc: &Document, scope: NodeId, selector: &str) -> QueryResult<Vec<NodeId>> {
101    let selectors = parse_selector(selector)?;
102    Ok(find_all_within_with_selector(doc, scope, &selectors))
103}
104
105/// Finds the first element matching a pre-parsed selector.
106///
107/// Use this for repeated queries with the same selector to avoid re-parsing.
108#[must_use]
109pub fn find_with_selector(
110    doc: &Document,
111    selectors: &SelectorList<ScrapeSelector>,
112) -> Option<NodeId> {
113    let root = doc.root()?;
114    let mut caches = SelectorCaches::default();
115
116    // Check root first
117    if matches_selector_with_caches(doc, root, selectors, &mut caches) {
118        return Some(root);
119    }
120
121    // Then check descendants
122    for id in doc.descendants(root) {
123        if let Some(node) = doc.get(id)
124            && node.kind.is_element()
125            && matches_selector_with_caches(doc, id, selectors, &mut caches)
126        {
127            return Some(id);
128        }
129    }
130
131    None
132}
133
134/// Finds all elements matching a pre-parsed selector.
135#[must_use]
136pub fn find_all_with_selector(
137    doc: &Document,
138    selectors: &SelectorList<ScrapeSelector>,
139) -> Vec<NodeId> {
140    let mut results = Vec::new();
141
142    let Some(root) = doc.root() else {
143        return results;
144    };
145
146    let mut caches = SelectorCaches::default();
147
148    // Check root first
149    if matches_selector_with_caches(doc, root, selectors, &mut caches) {
150        results.push(root);
151    }
152
153    // Then check descendants
154    for id in doc.descendants(root) {
155        if let Some(node) = doc.get(id)
156            && node.kind.is_element()
157            && matches_selector_with_caches(doc, id, selectors, &mut caches)
158        {
159            results.push(id);
160        }
161    }
162
163    results
164}
165
166/// Finds the first element matching a selector within a subtree.
167#[must_use]
168pub fn find_within_with_selector(
169    doc: &Document,
170    scope: NodeId,
171    selectors: &SelectorList<ScrapeSelector>,
172) -> Option<NodeId> {
173    let mut caches = SelectorCaches::default();
174
175    for id in doc.descendants(scope) {
176        if let Some(node) = doc.get(id)
177            && node.kind.is_element()
178            && matches_selector_with_caches(doc, id, selectors, &mut caches)
179        {
180            return Some(id);
181        }
182    }
183    None
184}
185
186/// Finds all elements matching a selector within a subtree.
187#[must_use]
188pub fn find_all_within_with_selector(
189    doc: &Document,
190    scope: NodeId,
191    selectors: &SelectorList<ScrapeSelector>,
192) -> Vec<NodeId> {
193    let mut results = Vec::new();
194    let mut caches = SelectorCaches::default();
195
196    for id in doc.descendants(scope) {
197        if let Some(node) = doc.get(id)
198            && node.kind.is_element()
199            && matches_selector_with_caches(doc, id, selectors, &mut caches)
200        {
201            results.push(id);
202        }
203    }
204
205    results
206}
207
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::{Html5everParser, Parser};

    /// Parses `html` with [`Html5everParser`], panicking if parsing fails.
    fn parse_doc(html: &str) -> Document {
        Html5everParser.parse(html).unwrap()
    }

    /// Returns the first node whose `id` attribute equals `wanted`, panicking if there is none.
    fn node_with_id_attr(doc: &Document, wanted: &str) -> NodeId {
        doc.nodes()
            .find(|(_, node)| {
                let id_attr = node.kind.attributes().and_then(|attrs| attrs.get("id"));
                id_attr.is_some_and(|value| value == wanted)
            })
            .map(|(node_id, _)| node_id)
            .unwrap()
    }

    /// A type selector locates the element and the hit really is that tag.
    #[test]
    fn test_find_by_tag() {
        let doc = parse_doc("<div><span>text</span></div>");
        let found = find(&doc, "span").unwrap();
        assert!(found.is_some());

        let tag = found.and_then(|id| doc.get(id)).and_then(|node| node.kind.tag_name());
        assert_eq!(tag, Some("span"));
    }

    /// A class selector locates an element carrying that class.
    #[test]
    fn test_find_by_class() {
        let doc = parse_doc("<div class=\"container\"><span class=\"item\">text</span></div>");
        assert!(find(&doc, ".item").unwrap().is_some());
    }

    /// An id selector locates the element with that id.
    #[test]
    fn test_find_by_id() {
        let doc = parse_doc("<div id=\"main\">text</div>");
        assert!(find(&doc, "#main").unwrap().is_some());
    }

    /// A valid selector with no matching element yields `Ok(None)`.
    #[test]
    fn test_find_returns_none_when_not_found() {
        let doc = parse_doc("<div>text</div>");
        assert!(find(&doc, "span").unwrap().is_none());
    }

    /// A syntactically invalid selector is reported as an error.
    #[test]
    fn test_find_invalid_selector() {
        let doc = parse_doc("<div>text</div>");
        assert!(find(&doc, "[").is_err());
    }

    /// Every element of the requested tag is returned.
    #[test]
    fn test_find_all_by_tag() {
        let doc = parse_doc("<ul><li>A</li><li>B</li><li>C</li></ul>");
        let items = find_all(&doc, "li").unwrap();
        assert_eq!(items.len(), 3);
    }

    /// A valid selector with no matching element yields an empty vector.
    #[test]
    fn test_find_all_returns_empty_when_not_found() {
        let doc = parse_doc("<div>text</div>");
        let items = find_all(&doc, "span").unwrap();
        assert!(items.is_empty());
    }

    /// Only the elements carrying the requested class are returned.
    #[test]
    fn test_find_all_by_class() {
        let doc =
            parse_doc("<div class=\"a\">1</div><div class=\"b\">2</div><div class=\"a\">3</div>");
        let items = find_all(&doc, ".a").unwrap();
        assert_eq!(items.len(), 2);
    }

    /// Tag, class and id can be combined in one compound selector.
    #[test]
    fn test_find_with_compound_selector() {
        let doc =
            parse_doc("<div class=\"foo\" id=\"bar\">match</div><div class=\"foo\">no id</div>");
        assert!(find(&doc, "div.foo#bar").unwrap().is_some());
    }

    /// The descendant combinator matches through intermediate elements.
    #[test]
    fn test_find_with_descendant_combinator() {
        let doc = parse_doc("<div><ul><li>item</li></ul></div>");
        assert!(find(&doc, "div li").unwrap().is_some());
    }

    /// The child combinator matches direct children only.
    #[test]
    fn test_find_with_child_combinator() {
        let doc =
            parse_doc("<div><span>direct</span></div><div><ul><span>nested</span></ul></div>");
        let items = find_all(&doc, "div > span").unwrap();
        assert_eq!(items.len(), 1);
    }

    /// A scoped search returns the span inside the scope, not the earlier one outside it.
    #[test]
    fn test_find_within_scope() {
        let doc = parse_doc("<div id=\"a\"><span>A</span></div><div id=\"b\"><span>B</span></div>");

        // Use div#b as the search scope.
        let scope = node_with_id_attr(&doc, "b");

        // Search for a span within div#b only.
        let found = find_within(&doc, scope, "span").unwrap();
        assert!(found.is_some());

        // The hit must be a direct child of the scope node.
        let parent_of_hit = found.map(|span_id| doc.parent(span_id).unwrap());
        assert_eq!(parent_of_hit, Some(scope));
    }

    /// A scoped search ignores matching elements outside the scope.
    #[test]
    fn test_find_all_within_scope() {
        let doc = parse_doc("<ul id=\"list\"><li>1</li><li>2</li></ul><li>outside</li>");

        // Use ul#list as the search scope.
        let scope = node_with_id_attr(&doc, "list");

        // Only the li elements inside ul#list are expected.
        let items = find_all_within(&doc, scope, "li").unwrap();
        assert_eq!(items.len(), 2);
    }

    /// With several candidates, `find` returns the one that comes first.
    #[test]
    fn test_find_returns_first_match() {
        let doc = parse_doc(
            "<div class=\"item\" id=\"first\">1</div><div class=\"item\" id=\"second\">2</div>",
        );
        let found = find(&doc, ".item").unwrap();
        assert!(found.is_some());

        let id_attr = found
            .and_then(|id| doc.get(id))
            .and_then(|node| node.kind.attributes())
            .and_then(|attrs| attrs.get("id"));
        assert_eq!(id_attr.map(String::as_str), Some("first"));
    }

    /// `find_all` returns matches in the order they appear in the markup.
    #[test]
    fn test_find_all_preserves_order() {
        let doc = parse_doc("<ul><li id=\"a\">A</li><li id=\"b\">B</li><li id=\"c\">C</li></ul>");

        let seen: Vec<Option<String>> = find_all(&doc, "li")
            .unwrap()
            .into_iter()
            .map(|id| {
                let attrs = doc.get(id).and_then(|node| node.kind.attributes());
                attrs.and_then(|map| map.get("id").cloned())
            })
            .collect();

        let expected: Vec<Option<String>> =
            ["a", "b", "c"].iter().map(|name| Some((*name).to_string())).collect();
        assert_eq!(seen, expected);
    }

    /// `find` on a freshly created document yields `Ok(None)`.
    #[test]
    fn test_find_empty_document() {
        let doc = Document::new();
        assert!(find(&doc, "div").unwrap().is_none());
    }

    /// `find_all` on a freshly created document yields an empty vector.
    #[test]
    fn test_find_all_empty_document() {
        let doc = Document::new();
        assert!(find_all(&doc, "div").unwrap().is_empty());
    }

    /// Attribute-value selectors are supported.
    #[test]
    fn test_find_with_attribute_selector() {
        let doc = parse_doc("<input type=\"text\"><input type=\"password\">");
        assert!(find(&doc, "input[type=\"text\"]").unwrap().is_some());
    }

    /// A comma-separated selector list matches elements of either selector.
    #[test]
    fn test_find_all_multiple_selectors() {
        let doc = parse_doc("<div>a</div><span>b</span><p>c</p>");
        let items = find_all(&doc, "div, span").unwrap();
        assert_eq!(items.len(), 2);
    }

    /// The universal selector matches at least the elements written in the markup.
    #[test]
    fn test_find_universal_selector() {
        let doc = parse_doc("<div><span>text</span></div>");
        let items = find_all(&doc, "*").unwrap();
        // Should match html, head, body, div, span (and possibly more from html5ever)
        assert!(items.len() >= 2);
    }
}