Skip to main content

scrape_core/query/
find.rs

1//! Find functions for locating elements in the DOM.
2//!
3//! This module provides functions to find elements using CSS selectors:
4//!
5//! - [`find`] - Find first matching element in the entire document
6//! - [`find_all`] - Find all matching elements in the entire document
7//! - [`find_within`] - Find first matching element within a subtree
8//! - [`find_all_within`] - Find all matching elements within a subtree
9
10use selectors::{context::SelectorCaches, parser::SelectorList};
11
12use super::{
13    CompiledSelector,
14    error::QueryResult,
15    selector::{ScrapeSelector, matches_selector_with_caches, parse_selector},
16};
17use crate::dom::{Document, NodeId};
18
19/// Finds the first element matching a CSS selector.
20///
21/// # Errors
22///
23/// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
24///
25/// # Examples
26///
27/// ```rust
28/// use scrape_core::{Html5everParser, Parser, query::find};
29///
30/// let parser = Html5everParser;
31/// let doc = parser.parse("<div><span class=\"item\">text</span></div>").unwrap();
32///
33/// let result = find(&doc, "span.item").unwrap();
34/// assert!(result.is_some());
35/// ```
36pub fn find(doc: &Document, selector: &str) -> QueryResult<Option<NodeId>> {
37    // Fast path: simple ID selector
38    if let Some(id) = selector.strip_prefix('#')
39        && is_simple_selector(id)
40        && let Some(index) = doc.index()
41    {
42        return Ok(index.get_by_id(id));
43    }
44
45    // Fast path: simple class selector
46    if let Some(class) = selector.strip_prefix('.')
47        && is_simple_selector(class)
48        && let Some(index) = doc.index()
49    {
50        return Ok(index.get_by_class(class).first().copied());
51    }
52
53    // Fall back to full selector matching
54    let selectors = parse_selector(selector)?;
55    Ok(find_with_selector(doc, &selectors))
56}
57
58/// Finds all elements matching a CSS selector.
59///
60/// # Errors
61///
62/// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
63///
64/// # Examples
65///
66/// ```rust
67/// use scrape_core::{Html5everParser, Parser, query::find_all};
68///
69/// let parser = Html5everParser;
70/// let doc = parser.parse("<ul><li>A</li><li>B</li><li>C</li></ul>").unwrap();
71///
72/// let items = find_all(&doc, "li").unwrap();
73/// assert_eq!(items.len(), 3);
74/// ```
75pub fn find_all(doc: &Document, selector: &str) -> QueryResult<Vec<NodeId>> {
76    // Fast path: simple ID selector
77    if let Some(id) = selector.strip_prefix('#')
78        && is_simple_selector(id)
79        && let Some(index) = doc.index()
80    {
81        return Ok(index.get_by_id(id).into_iter().collect());
82    }
83
84    // Fast path: simple class selector
85    if let Some(class) = selector.strip_prefix('.')
86        && is_simple_selector(class)
87        && let Some(index) = doc.index()
88    {
89        return Ok(index.get_by_class(class).to_vec());
90    }
91
92    // Fall back to full selector matching
93    let selectors = parse_selector(selector)?;
94    Ok(find_all_with_selector(doc, &selectors))
95}
96
97/// Finds the first element matching a CSS selector within a subtree.
98///
99/// The search starts from the given scope node and only includes its descendants.
100///
101/// # Errors
102///
103/// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
104///
105/// # Examples
106///
107/// ```rust
108/// use scrape_core::{Html5everParser, Parser, query::find_within};
109///
110/// let parser = Html5everParser;
111/// let doc = parser
112///     .parse("<div id=\"a\"><span>A</span></div><div id=\"b\"><span>B</span></div>")
113///     .unwrap();
114///
115/// // Find div#a first
116/// let scope = doc
117///     .nodes()
118///     .find(|(_, n)| n.kind.attributes().and_then(|a| a.get("id")) == Some(&"a".to_string()))
119///     .map(|(id, _)| id)
120///     .unwrap();
121///
122/// let result = find_within(&doc, scope, "span").unwrap();
123/// assert!(result.is_some());
124/// ```
125pub fn find_within(doc: &Document, scope: NodeId, selector: &str) -> QueryResult<Option<NodeId>> {
126    let selectors = parse_selector(selector)?;
127    Ok(find_within_with_selector(doc, scope, &selectors))
128}
129
130/// Finds all elements matching a CSS selector within a subtree.
131///
132/// # Errors
133///
134/// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
135pub fn find_all_within(doc: &Document, scope: NodeId, selector: &str) -> QueryResult<Vec<NodeId>> {
136    let selectors = parse_selector(selector)?;
137    Ok(find_all_within_with_selector(doc, scope, &selectors))
138}
139
140/// Finds the first element matching a pre-parsed selector.
141///
142/// Use this for repeated queries with the same selector to avoid re-parsing.
143#[must_use]
144pub fn find_with_selector(
145    doc: &Document,
146    selectors: &SelectorList<ScrapeSelector>,
147) -> Option<NodeId> {
148    let root = doc.root()?;
149    let mut caches = SelectorCaches::default();
150
151    // Check root first
152    if matches_selector_with_caches(doc, root, selectors, &mut caches) {
153        return Some(root);
154    }
155
156    // Then check descendants
157    for id in doc.descendants(root) {
158        if let Some(node) = doc.get(id)
159            && node.kind.is_element()
160            && matches_selector_with_caches(doc, id, selectors, &mut caches)
161        {
162            return Some(id);
163        }
164    }
165
166    None
167}
168
169/// Finds all elements matching a pre-parsed selector.
170#[must_use]
171pub fn find_all_with_selector(
172    doc: &Document,
173    selectors: &SelectorList<ScrapeSelector>,
174) -> Vec<NodeId> {
175    let mut results = Vec::new();
176
177    let Some(root) = doc.root() else {
178        return results;
179    };
180
181    let mut caches = SelectorCaches::default();
182
183    // Check root first
184    if matches_selector_with_caches(doc, root, selectors, &mut caches) {
185        results.push(root);
186    }
187
188    // Then check descendants
189    for id in doc.descendants(root) {
190        if let Some(node) = doc.get(id)
191            && node.kind.is_element()
192            && matches_selector_with_caches(doc, id, selectors, &mut caches)
193        {
194            results.push(id);
195        }
196    }
197
198    results
199}
200
201/// Finds the first element matching a selector within a subtree.
202#[must_use]
203pub fn find_within_with_selector(
204    doc: &Document,
205    scope: NodeId,
206    selectors: &SelectorList<ScrapeSelector>,
207) -> Option<NodeId> {
208    let mut caches = SelectorCaches::default();
209
210    for id in doc.descendants(scope) {
211        if let Some(node) = doc.get(id)
212            && node.kind.is_element()
213            && matches_selector_with_caches(doc, id, selectors, &mut caches)
214        {
215            return Some(id);
216        }
217    }
218    None
219}
220
221/// Finds all elements matching a selector within a subtree.
222#[must_use]
223pub fn find_all_within_with_selector(
224    doc: &Document,
225    scope: NodeId,
226    selectors: &SelectorList<ScrapeSelector>,
227) -> Vec<NodeId> {
228    let mut results = Vec::new();
229    let mut caches = SelectorCaches::default();
230
231    for id in doc.descendants(scope) {
232        if let Some(node) = doc.get(id)
233            && node.kind.is_element()
234            && matches_selector_with_caches(doc, id, selectors, &mut caches)
235        {
236            results.push(id);
237        }
238    }
239
240    results
241}
242
243/// Finds the first element matching a compiled selector.
244///
245/// # Examples
246///
247/// ```rust
248/// use scrape_core::{
249///     Html5everParser, Parser,
250///     query::{CompiledSelector, find_compiled},
251/// };
252///
253/// let parser = Html5everParser;
254/// let doc = parser.parse("<div><span class=\"item\">text</span></div>").unwrap();
255/// let selector = CompiledSelector::compile("span.item").unwrap();
256///
257/// let result = find_compiled(&doc, &selector);
258/// assert!(result.is_some());
259/// ```
260#[must_use]
261pub fn find_compiled(doc: &Document, selector: &CompiledSelector) -> Option<NodeId> {
262    find_with_selector(doc, selector.selector_list())
263}
264
265/// Finds all elements matching a compiled selector.
266///
267/// # Examples
268///
269/// ```rust
270/// use scrape_core::{
271///     Html5everParser, Parser,
272///     query::{CompiledSelector, find_all_compiled},
273/// };
274///
275/// let parser = Html5everParser;
276/// let doc = parser.parse("<ul><li>A</li><li>B</li><li>C</li></ul>").unwrap();
277/// let selector = CompiledSelector::compile("li").unwrap();
278///
279/// let items = find_all_compiled(&doc, &selector);
280/// assert_eq!(items.len(), 3);
281/// ```
282#[must_use]
283pub fn find_all_compiled(doc: &Document, selector: &CompiledSelector) -> Vec<NodeId> {
284    find_all_with_selector(doc, selector.selector_list())
285}
286
287/// Finds the first element matching a compiled selector within a subtree.
288///
289/// # Examples
290///
291/// ```rust
292/// use scrape_core::{
293///     Html5everParser, Parser,
294///     query::{CompiledSelector, find_within_compiled},
295/// };
296///
297/// let parser = Html5everParser;
298/// let doc = parser
299///     .parse("<div id=\"a\"><span>A</span></div><div id=\"b\"><span>B</span></div>")
300///     .unwrap();
301/// let selector = CompiledSelector::compile("span").unwrap();
302///
303/// // Find div#a first
304/// let scope = doc
305///     .nodes()
306///     .find(|(_, n)| n.kind.attributes().and_then(|a| a.get("id")) == Some(&"a".to_string()))
307///     .map(|(id, _)| id)
308///     .unwrap();
309///
310/// let result = find_within_compiled(&doc, scope, &selector);
311/// assert!(result.is_some());
312/// ```
313#[must_use]
314pub fn find_within_compiled(
315    doc: &Document,
316    scope: NodeId,
317    selector: &CompiledSelector,
318) -> Option<NodeId> {
319    find_within_with_selector(doc, scope, selector.selector_list())
320}
321
322/// Finds all elements matching a compiled selector within a subtree.
323#[must_use]
324pub fn find_all_within_compiled(
325    doc: &Document,
326    scope: NodeId,
327    selector: &CompiledSelector,
328) -> Vec<NodeId> {
329    find_all_within_with_selector(doc, scope, selector.selector_list())
330}
331
/// Checks if a selector string is simple (no combinators or complex syntax).
///
/// A simple selector is non-empty and contains only alphanumeric characters
/// (Unicode-aware, so names such as `日本語` qualify), hyphens, and underscores.
/// Anything else — combinators (`>`, `+`, `~`, any whitespace including tabs and
/// newlines), attribute selectors, pseudo-classes, selector lists, backslash
/// escapes, quotes — makes the string non-simple, so callers hand it to the
/// real selector parser instead of treating it as a literal index key.
///
/// This is an allowlist on purpose: an unknown character must fall through to
/// the parser rather than be looked up verbatim.
#[inline]
fn is_simple_selector(s: &str) -> bool {
    !s.is_empty() && s.chars().all(|c| c.is_alphanumeric() || matches!(c, '-' | '_'))
}
341
// Unit tests for the find functions: general selector matching, scoped
// searches, and the indexed fast paths for bare `#id` / `.class` selectors.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::{Html5everParser, Parser};

    /// Parses `html` with `Html5everParser`, panicking on failure.
    fn parse_doc(html: &str) -> Document {
        Html5everParser.parse(html).unwrap()
    }

    // --- Basic selector matching ---------------------------------------------

    #[test]
    fn test_find_by_tag() {
        let doc = parse_doc("<div><span>text</span></div>");
        let result = find(&doc, "span").unwrap();
        assert!(result.is_some());

        let span_id = result.unwrap();
        assert_eq!(doc.get(span_id).unwrap().kind.tag_name(), Some("span"));
    }

    #[test]
    fn test_find_by_class() {
        let doc = parse_doc("<div class=\"container\"><span class=\"item\">text</span></div>");
        let result = find(&doc, ".item").unwrap();
        assert!(result.is_some());
    }

    #[test]
    fn test_find_by_id() {
        let doc = parse_doc("<div id=\"main\">text</div>");
        let result = find(&doc, "#main").unwrap();
        assert!(result.is_some());
    }

    #[test]
    fn test_find_returns_none_when_not_found() {
        let doc = parse_doc("<div>text</div>");
        let result = find(&doc, "span").unwrap();
        assert!(result.is_none());
    }

    // "[" does not start with '#' or '.', so it bypasses both fast paths and
    // reaches the selector parser.
    #[test]
    fn test_find_invalid_selector() {
        let doc = parse_doc("<div>text</div>");
        let result = find(&doc, "[");
        assert!(result.is_err());
    }

    #[test]
    fn test_find_all_by_tag() {
        let doc = parse_doc("<ul><li>A</li><li>B</li><li>C</li></ul>");
        let results = find_all(&doc, "li").unwrap();
        assert_eq!(results.len(), 3);
    }

    #[test]
    fn test_find_all_returns_empty_when_not_found() {
        let doc = parse_doc("<div>text</div>");
        let results = find_all(&doc, "span").unwrap();
        assert!(results.is_empty());
    }

    #[test]
    fn test_find_all_by_class() {
        let doc =
            parse_doc("<div class=\"a\">1</div><div class=\"b\">2</div><div class=\"a\">3</div>");
        let results = find_all(&doc, ".a").unwrap();
        assert_eq!(results.len(), 2);
    }

    // --- Compound selectors and combinators ------------------------------------

    #[test]
    fn test_find_with_compound_selector() {
        let doc =
            parse_doc("<div class=\"foo\" id=\"bar\">match</div><div class=\"foo\">no id</div>");
        let result = find(&doc, "div.foo#bar").unwrap();
        assert!(result.is_some());
    }

    #[test]
    fn test_find_with_descendant_combinator() {
        let doc = parse_doc("<div><ul><li>item</li></ul></div>");
        let result = find(&doc, "div li").unwrap();
        assert!(result.is_some());
    }

    // Only the span that is a direct child of a div should match.
    #[test]
    fn test_find_with_child_combinator() {
        let doc =
            parse_doc("<div><span>direct</span></div><div><ul><span>nested</span></ul></div>");
        let results = find_all(&doc, "div > span").unwrap();
        assert_eq!(results.len(), 1);
    }

    // --- Scoped searches -------------------------------------------------------

    #[test]
    fn test_find_within_scope() {
        let doc = parse_doc("<div id=\"a\"><span>A</span></div><div id=\"b\"><span>B</span></div>");

        // Find div#b
        let scope = doc
            .nodes()
            .find(|(_, n)| {
                n.kind.attributes().and_then(|a| a.get("id")).is_some_and(|id| id == "b")
            })
            .map(|(id, _)| id)
            .unwrap();

        // Find span within div#b only
        let result = find_within(&doc, scope, "span").unwrap();
        assert!(result.is_some());

        // Verify it's the correct span (child of scope)
        let span_id = result.unwrap();
        let span_parent = doc.parent(span_id).unwrap();
        assert_eq!(span_parent, scope);
    }

    #[test]
    fn test_find_all_within_scope() {
        let doc = parse_doc("<ul id=\"list\"><li>1</li><li>2</li></ul><li>outside</li>");

        // Find ul#list
        let scope = doc
            .nodes()
            .find(|(_, n)| {
                n.kind.attributes().and_then(|a| a.get("id")).is_some_and(|id| id == "list")
            })
            .map(|(id, _)| id)
            .unwrap();

        let results = find_all_within(&doc, scope, "li").unwrap();
        assert_eq!(results.len(), 2); // Only li elements inside ul#list
    }

    // --- Ordering guarantees ---------------------------------------------------

    // `.item` is a bare class selector, so this also pins which element the
    // class fast path reports first.
    #[test]
    fn test_find_returns_first_match() {
        let doc = parse_doc(
            "<div class=\"item\" id=\"first\">1</div><div class=\"item\" id=\"second\">2</div>",
        );
        let result = find(&doc, ".item").unwrap();
        assert!(result.is_some());

        let id = result.unwrap();
        let attrs = doc.get(id).unwrap().kind.attributes().unwrap();
        assert_eq!(attrs.get("id"), Some(&"first".to_string()));
    }

    #[test]
    fn test_find_all_preserves_order() {
        let doc = parse_doc("<ul><li id=\"a\">A</li><li id=\"b\">B</li><li id=\"c\">C</li></ul>");
        let results = find_all(&doc, "li").unwrap();

        // Map each result back to its `id` attribute to compare against document order.
        let ids: Vec<_> = results
            .iter()
            .map(|id| {
                doc.get(*id).and_then(|n| n.kind.attributes()).and_then(|a| a.get("id").cloned())
            })
            .collect();

        assert_eq!(ids, vec![Some("a".into()), Some("b".into()), Some("c".into())]);
    }

    // --- Empty documents -------------------------------------------------------

    #[test]
    fn test_find_empty_document() {
        let doc = Document::new();
        let result = find(&doc, "div").unwrap();
        assert!(result.is_none());
    }

    #[test]
    fn test_find_all_empty_document() {
        let doc = Document::new();
        let results = find_all(&doc, "div").unwrap();
        assert!(results.is_empty());
    }

    // --- Other selector forms --------------------------------------------------

    #[test]
    fn test_find_with_attribute_selector() {
        let doc = parse_doc("<input type=\"text\"><input type=\"password\">");
        let result = find(&doc, "input[type=\"text\"]").unwrap();
        assert!(result.is_some());
    }

    #[test]
    fn test_find_all_multiple_selectors() {
        let doc = parse_doc("<div>a</div><span>b</span><p>c</p>");
        let results = find_all(&doc, "div, span").unwrap();
        assert_eq!(results.len(), 2);
    }

    #[test]
    fn test_find_universal_selector() {
        let doc = parse_doc("<div><span>text</span></div>");
        let results = find_all(&doc, "*").unwrap();
        // Should match html, head, body, div, span (and possibly more from html5ever)
        assert!(results.len() >= 2);
    }

    // --- Fast-path eligibility -------------------------------------------------

    // Inputs are the text *after* the leading '#' or '.' has been stripped.
    #[test]
    fn test_is_simple_selector() {
        assert!(is_simple_selector("main"));
        assert!(is_simple_selector("my-id"));
        assert!(is_simple_selector("my_class"));
        assert!(is_simple_selector("id123"));

        assert!(!is_simple_selector(""));
        assert!(!is_simple_selector("foo bar"));
        assert!(!is_simple_selector("foo.bar"));
        assert!(!is_simple_selector("foo#bar"));
        assert!(!is_simple_selector("foo[attr]"));
        assert!(!is_simple_selector("foo:hover"));
        assert!(!is_simple_selector("foo>bar"));
        assert!(!is_simple_selector("foo+bar"));
        assert!(!is_simple_selector("foo~bar"));
        assert!(!is_simple_selector("foo,bar"));
        assert!(!is_simple_selector("*"));
        assert!(!is_simple_selector("foo(bar)"));
    }

    // --- Fast paths for bare `#id` / `.class` ----------------------------------

    #[test]
    fn test_fast_path_id_selector() {
        let doc = parse_doc("<div id='main'><span id='inner'>text</span></div>");

        let main = find(&doc, "#main").unwrap();
        assert!(main.is_some());
        let main_id = main.unwrap();
        assert_eq!(doc.get(main_id).unwrap().kind.tag_name(), Some("div"));

        let inner = find(&doc, "#inner").unwrap();
        assert!(inner.is_some());
    }

    #[test]
    fn test_fast_path_class_selector_find() {
        let doc = parse_doc("<div class='item'>A</div><div class='item'>B</div>");

        let first = find(&doc, ".item").unwrap();
        assert!(first.is_some());
    }

    #[test]
    fn test_fast_path_class_selector_find_all() {
        let doc = parse_doc(
            "<div class='item'>A</div><div class='item'>B</div><div class='item'>C</div>",
        );

        let items = find_all(&doc, ".item").unwrap();
        assert_eq!(items.len(), 3);
    }

    #[test]
    fn test_fast_path_id_not_found() {
        let doc = parse_doc("<div id='main'>text</div>");

        let result = find(&doc, "#notfound").unwrap();
        assert!(result.is_none());
    }

    #[test]
    fn test_fast_path_class_not_found() {
        let doc = parse_doc("<div class='foo'>text</div>");

        let results = find_all(&doc, ".notfound").unwrap();
        assert!(results.is_empty());
    }

    // Selectors that begin like a fast-path candidate but contain more syntax
    // must be handled by the full matcher.
    #[test]
    fn test_complex_selector_fallback() {
        let doc = parse_doc("<div id='main' class='container'>text</div>");

        let result = find(&doc, "#main.container").unwrap();
        assert!(result.is_some());

        let result = find(&doc, "div#main").unwrap();
        assert!(result.is_some());

        let result = find(&doc, "div > #main").unwrap();
        assert!(result.is_none());
    }

    // Only checks that *some* element is returned for a duplicated ID; which
    // duplicate is returned is not asserted.
    #[test]
    fn test_fast_path_duplicate_ids() {
        let doc = parse_doc("<div id='dup'>First</div><div id='dup'>Second</div>");

        let found = find(&doc, "#dup").unwrap();
        assert!(found.is_some());
    }

    #[test]
    fn test_fast_path_multiple_classes() {
        let doc = parse_doc("<div class='foo bar'>A</div><div class='bar baz'>B</div>");

        let items = find_all(&doc, ".bar").unwrap();
        assert_eq!(items.len(), 2);
    }

    // Builds a document by hand instead of through the parser.
    // NOTE(review): per the test name, such a document is expected to have no
    // index, exercising the fallback for `#id` / `.class` — confirm against
    // `Document::index`.
    #[test]
    fn test_fast_path_with_no_index() {
        let mut doc = Document::new();
        #[allow(clippy::default_trait_access)]
        let root_id = doc.create_element("html".to_string(), Default::default());
        doc.set_root(root_id);
        #[allow(clippy::default_trait_access)]
        let elem = doc.create_element("div".to_string(), Default::default());
        doc.append_child(root_id, elem);

        let result = find(&doc, "#test").unwrap();
        assert!(result.is_none());

        let results = find_all(&doc, ".test").unwrap();
        assert!(results.is_empty());
    }

    #[test]
    fn test_fast_path_unicode_selectors() {
        let doc = parse_doc("<div id='日本語'>Japanese</div><div class='中文'>Chinese</div>");

        let result = find(&doc, "#日本語").unwrap();
        assert!(result.is_some());

        let results = find_all(&doc, ".中文").unwrap();
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_fast_path_very_long_selector() {
        let long_id = "a".repeat(1000);
        let html = format!("<div id='{long_id}'>text</div>");
        let doc = parse_doc(&html);

        let result = find(&doc, &format!("#{long_id}")).unwrap();
        assert!(result.is_some());
    }

    #[test]
    fn test_fast_path_empty_class_attribute() {
        let doc = parse_doc("<div class=''>Empty</div><div>No class</div>");

        let results = find_all(&doc, ".foo").unwrap();
        assert!(results.is_empty());
    }

    // Escaped selectors still contain ':' / '.', so `is_simple_selector`
    // rejects them and the parser resolves the escapes.
    #[test]
    fn test_fast_path_special_chars_in_selector() {
        let doc = parse_doc("<div id='test:id'>Colon</div><div class='foo.bar'>Dot</div>");

        let result = find(&doc, "#test\\:id").unwrap();
        assert!(result.is_some());

        let results = find_all(&doc, ".foo\\.bar").unwrap();
        assert_eq!(results.len(), 1);
    }

    // Attribute selectors bypass the fast paths, giving a reference result to
    // compare the `#id` / `.class` forms against.
    #[test]
    fn test_fast_path_vs_fallback_consistency() {
        let doc =
            parse_doc("<div id='main' class='container'>A</div><div class='container'>B</div>");

        let fast_result = find(&doc, "#main").unwrap();
        let fallback_result = find(&doc, "[id='main']").unwrap();
        assert_eq!(fast_result, fallback_result);

        let fast_results = find_all(&doc, ".container").unwrap();
        let fallback_results = find_all(&doc, "[class~='container']").unwrap();
        assert_eq!(fast_results.len(), fallback_results.len());
    }
}