Skip to main content

fhp_selector/
lib.rs

1//! CSS selector and XPath engine for the SIMD-optimized HTML parser.
2//!
3//! Provides CSS selector parsing, XPath evaluation, and a convenience API
4//! for querying a parsed [`fhp_tree::Document`].
5//!
6//! # Quick Start — CSS
7//!
8//! ```
9//! use fhp_tree::parse;
10//! use fhp_selector::Selectable;
11//!
12//! let doc = parse("<div><p class=\"intro\">Hello</p></div>").unwrap();
13//! let sel = doc.select("p.intro").unwrap();
14//! assert_eq!(sel.len(), 1);
15//! assert_eq!(sel.text(), "Hello");
16//! ```
17//!
18//! # Quick Start — XPath
19//!
20//! ```
21//! use fhp_tree::parse;
22//! use fhp_selector::Selectable;
23//! use fhp_selector::xpath::ast::XPathResult;
24//!
25//! let doc = parse("<div><p>Hello</p></div>").unwrap();
26//! let result = doc.xpath("//p/text()").unwrap();
27//! match result {
28//!     XPathResult::Strings(texts) => assert_eq!(texts[0], "Hello"),
29//!     _ => panic!("expected strings"),
30//! }
31//! ```
32//!
33//! # Supported CSS Selectors
34//!
35//! - Type: `div`, `p`, `span`
36//! - Class: `.class`
37//! - ID: `#id`
38//! - Universal: `*`
39//! - Attribute: `[attr]`, `[attr=val]`, `[attr~=val]`, `[attr^=val]`, `[attr$=val]`, `[attr*=val]`
40//! - Pseudo: `:first-child`, `:last-child`, `:nth-child(an+b)`, `:not(sel)`
41//! - Compound: `div.class#id[attr]`
42//! - Combinator: `A B`, `A > B`, `A + B`, `A ~ B`
43//! - Comma list: `div, span`
44//!
45//! # Supported XPath
46//!
47//! - `//tag` — descendant search
48//! - `//tag[@attr='value']` — attribute predicate
49//! - `/path/to/tag` — absolute path
50//! - `//tag[contains(@attr, 'substr')]` — contains predicate
51//! - `//tag[position()=N]` — position predicate
52//! - `//tag/text()` — text extraction
53//! - `..` — parent axis
54
55/// CSS selector AST types.
56pub mod ast;
57/// Bloom filter for ancestor pre-filtering.
58pub mod bloom;
59/// Right-to-left matching engine.
60pub mod matcher;
61/// CSS selector parser.
62pub mod parser;
63/// XPath expression support.
64pub mod xpath;
65
66use std::cell::RefCell;
67use std::collections::{HashMap, VecDeque};
68use std::sync::Arc;
69
70use fhp_core::error::{SelectorError, XPathError};
71use fhp_core::tag::Tag;
72use fhp_tree::node::{NodeFlags, NodeId};
73use fhp_tree::{Document, NodeRef};
74
75use matcher::{select_all_list, select_first_list};
76use parser::parse_selector;
77use xpath::ast::XPathResult;
78
79#[inline]
80fn is_document_element(n: &fhp_tree::node::Node) -> bool {
81    n.depth > 0
82        && !n.flags.has(NodeFlags::IS_TEXT)
83        && !n.flags.has(NodeFlags::IS_COMMENT)
84        && !n.flags.has(NodeFlags::IS_DOCTYPE)
85}
86
87/// A pre-compiled CSS selector for reuse across documents and threads.
88///
89/// Parsing a CSS selector string has non-trivial cost. When the same selector
90/// is used to query many documents (e.g., in a scraping loop), compile it once
91/// and reuse it to eliminate repeated parse overhead.
92///
93/// # Example
94///
95/// ```
96/// use fhp_tree::parse;
97/// use fhp_selector::{CompiledSelector, Selectable};
98///
99/// let sel = CompiledSelector::new("div.content").unwrap();
100/// let doc = parse("<div class=\"content\">Hello</div>").unwrap();
101/// let results = doc.select_compiled(&sel).unwrap();
102/// assert_eq!(results.len(), 1);
103/// ```
104#[derive(Clone)]
105pub struct CompiledSelector {
106    list: Arc<ast::SelectorList>,
107}
108
109impl CompiledSelector {
110    /// Compile a CSS selector string.
111    ///
112    /// # Errors
113    ///
114    /// Returns [`SelectorError::Invalid`] if the selector syntax is invalid.
115    pub fn new(css: &str) -> Result<Self, SelectorError> {
116        Ok(Self {
117            list: Arc::new(parse_selector(css)?),
118        })
119    }
120
121    /// Access the underlying parsed selector list.
122    pub fn as_list(&self) -> &ast::SelectorList {
123        &self.list
124    }
125}
126
127impl core::fmt::Debug for CompiledSelector {
128    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
129        f.debug_struct("CompiledSelector")
130            .field("selectors", &self.list.selectors.len())
131            .finish()
132    }
133}
134
135/// Maximum number of parsed selector ASTs cached per thread.
136const SELECTOR_CACHE_CAPACITY: usize = 256;
137/// Skip caching unusually long selectors to avoid pinning large keys.
138const MAX_CACHED_SELECTOR_LEN: usize = 512;
139
140struct SelectorCache {
141    map: HashMap<String, Arc<ast::SelectorList>>,
142    order: VecDeque<String>,
143}
144
145impl SelectorCache {
146    fn new() -> Self {
147        Self {
148            map: HashMap::with_capacity(SELECTOR_CACHE_CAPACITY),
149            order: VecDeque::with_capacity(SELECTOR_CACHE_CAPACITY),
150        }
151    }
152
153    fn get(&self, css: &str) -> Option<Arc<ast::SelectorList>> {
154        self.map.get(css).map(Arc::clone)
155    }
156
157    fn insert(&mut self, css: &str, list: Arc<ast::SelectorList>) {
158        if self.map.contains_key(css) {
159            self.map.insert(css.to_owned(), list);
160            return;
161        }
162
163        if self.map.len() >= SELECTOR_CACHE_CAPACITY {
164            if let Some(old_key) = self.order.pop_front() {
165                self.map.remove(&old_key);
166            }
167        }
168
169        let key = css.to_owned();
170        self.order.push_back(key.clone());
171        self.map.insert(key, list);
172    }
173}
174
175thread_local! {
176    static SELECTOR_CACHE: RefCell<SelectorCache> = RefCell::new(SelectorCache::new());
177}
178
179#[inline]
180fn parse_selector_cached(css: &str) -> Result<Arc<ast::SelectorList>, SelectorError> {
181    if css.len() > MAX_CACHED_SELECTOR_LEN {
182        return Ok(Arc::new(parse_selector(css)?));
183    }
184
185    SELECTOR_CACHE.with(|cache| {
186        if let Some(list) = cache.borrow().get(css) {
187            return Ok(list);
188        }
189
190        let parsed = Arc::new(parse_selector(css)?);
191        cache.borrow_mut().insert(css, Arc::clone(&parsed));
192        Ok(parsed)
193    })
194}
195
196/// Merge results from multiple roots, deduplicating by NodeId index.
197///
198/// DFS results are already in document order (ascending NodeId index), so
199/// we merge sorted lists instead of using a HashSet. O(n) vs O(n log n).
200fn merge_dedup_results(
201    _arena: &fhp_tree::arena::Arena,
202    roots: &[NodeId],
203    mut query: impl FnMut(NodeId) -> Vec<NodeId>,
204) -> Vec<NodeId> {
205    let mut results = Vec::new();
206    let mut max_seen = u32::MAX; // tracks highest NodeId index seen
207    for &root in roots {
208        for id in query(root) {
209            let idx = id.index() as u32;
210            if results.is_empty() || idx > max_seen {
211                max_seen = idx;
212                results.push(id);
213            } else if idx != max_seen {
214                // Out-of-order (different subtree) — insert if not duplicate.
215                // For typical DFS, this branch is rare.
216                if !results.contains(&id) {
217                    results.push(id);
218                }
219            }
220        }
221    }
222    results
223}
224
225/// A collection of matched nodes from a selector query.
226///
227/// Provides iteration, text extraction, attribute access, and
228/// sub-selection (chaining).
229pub struct Selection<'a> {
230    doc: &'a Document,
231    nodes: Vec<NodeId>,
232}
233
234impl<'a> Selection<'a> {
235    /// Create a new selection from a document and node list.
236    fn new(doc: &'a Document, nodes: Vec<NodeId>) -> Self {
237        Self { doc, nodes }
238    }
239
240    /// Get the first matched node.
241    pub fn first(&self) -> Option<NodeRef<'a>> {
242        self.nodes.first().map(|&id| self.doc.get(id))
243    }
244
245    /// Iterate over matched nodes as [`NodeRef`].
246    pub fn iter(&self) -> impl Iterator<Item = NodeRef<'a>> + '_ {
247        self.nodes.iter().map(|&id| self.doc.get(id))
248    }
249
250    /// Iterate over matched node ids.
251    pub fn node_ids(&self) -> &[NodeId] {
252        &self.nodes
253    }
254
255    /// Collect text content from all matched nodes.
256    pub fn text(&self) -> String {
257        self.iter()
258            .map(|n| n.text_content())
259            .collect::<Vec<_>>()
260            .join("")
261    }
262
263    /// Get an attribute value from the first matched node.
264    pub fn attr(&self, name: &str) -> Option<&'a str> {
265        self.first()?.attr(name)
266    }
267
268    /// Get inner HTML from the first matched node.
269    pub fn inner_html(&self) -> String {
270        self.first().map(|n| n.inner_html()).unwrap_or_default()
271    }
272
273    /// Number of matched nodes.
274    pub fn len(&self) -> usize {
275        self.nodes.len()
276    }
277
278    /// Whether the selection is empty.
279    pub fn is_empty(&self) -> bool {
280        self.nodes.is_empty()
281    }
282
283    /// Sub-select with a pre-compiled selector within the matched nodes.
284    ///
285    /// Each matched node is used as a subtree root, and results are
286    /// deduplicated in document order.
287    pub fn select_compiled(&self, sel: &CompiledSelector) -> Result<Selection<'a>, SelectorError> {
288        let list = &sel.list;
289        if self.nodes.len() == 1 {
290            let results = select_all_list(self.doc.arena(), self.nodes[0], list);
291            return Ok(Selection::new(self.doc, results));
292        }
293        let results = merge_dedup_results(self.doc.arena(), &self.nodes, |root| {
294            select_all_list(self.doc.arena(), root, list)
295        });
296        Ok(Selection::new(self.doc, results))
297    }
298
299    /// Sub-select: run a CSS selector within the matched nodes.
300    ///
301    /// Each matched node is used as a subtree root, and results are
302    /// deduplicated in document order.
303    pub fn select(&self, css: &str) -> Result<Selection<'a>, SelectorError> {
304        let list = parse_selector_cached(css)?;
305        if self.nodes.len() == 1 {
306            // Single root — DFS produces document-order results, no duplicates possible.
307            let results = select_all_list(self.doc.arena(), self.nodes[0], &list);
308            return Ok(Selection::new(self.doc, results));
309        }
310        let results = merge_dedup_results(self.doc.arena(), &self.nodes, |root| {
311            select_all_list(self.doc.arena(), root, &list)
312        });
313        Ok(Selection::new(self.doc, results))
314    }
315
316    /// Evaluate an XPath expression within the matched nodes.
317    ///
318    /// Each matched node is used as a context root, and results are
319    /// deduplicated in document order.
320    pub fn xpath(&self, expr: &str) -> Result<XPathResult, XPathError> {
321        let parsed = xpath::parser::parse_xpath(expr)?;
322        let mut all_nodes = Vec::new();
323        let mut all_strings = Vec::new();
324        let mut seen = std::collections::HashSet::new();
325
326        for &node_id in &self.nodes {
327            let result = xpath::eval::evaluate(&parsed, self.doc.arena(), node_id);
328            match result {
329                XPathResult::Nodes(nodes) => {
330                    for id in nodes {
331                        if seen.insert(id) {
332                            all_nodes.push(id);
333                        }
334                    }
335                }
336                XPathResult::Strings(strings) => {
337                    all_strings.extend(strings);
338                }
339                XPathResult::Boolean(b) => return Ok(XPathResult::Boolean(b)),
340            }
341        }
342
343        if !all_strings.is_empty() {
344            Ok(XPathResult::Strings(all_strings))
345        } else {
346            Ok(XPathResult::Nodes(all_nodes))
347        }
348    }
349}
350
351impl<'a> IntoIterator for &'a Selection<'a> {
352    type Item = NodeRef<'a>;
353    type IntoIter = SelectionIter<'a>;
354
355    fn into_iter(self) -> Self::IntoIter {
356        SelectionIter {
357            doc: self.doc,
358            inner: self.nodes.iter(),
359        }
360    }
361}
362
363/// Iterator over [`Selection`] results.
364pub struct SelectionIter<'a> {
365    doc: &'a Document,
366    inner: std::slice::Iter<'a, NodeId>,
367}
368
369impl<'a> Iterator for SelectionIter<'a> {
370    type Item = NodeRef<'a>;
371
372    fn next(&mut self) -> Option<Self::Item> {
373        self.inner.next().map(|&id| self.doc.get(id))
374    }
375
376    fn size_hint(&self) -> (usize, Option<usize>) {
377        self.inner.size_hint()
378    }
379}
380
381impl<'a> ExactSizeIterator for SelectionIter<'a> {}
382
383/// Extension trait that adds CSS selector methods to [`Document`].
384///
385/// Import this trait to use `.select()` and convenience methods on a document.
386pub trait Selectable {
387    /// Select all nodes matching a CSS selector.
388    ///
389    /// # Errors
390    ///
391    /// Returns [`SelectorError::Invalid`] if the selector syntax is invalid.
392    ///
393    /// # Example
394    ///
395    /// ```
396    /// use fhp_tree::parse;
397    /// use fhp_selector::Selectable;
398    ///
399    /// let doc = parse("<div><p>Hello</p></div>").unwrap();
400    /// let sel = doc.select("p").unwrap();
401    /// assert_eq!(sel.len(), 1);
402    /// ```
403    fn select(&self, css: &str) -> Result<Selection<'_>, SelectorError>;
404
405    /// Select all nodes matching a pre-compiled CSS selector.
406    ///
407    /// This avoids re-parsing the selector string on every call, which is
408    /// beneficial when the same selector is reused across many documents.
409    fn select_compiled(&self, sel: &CompiledSelector) -> Result<Selection<'_>, SelectorError>;
410
411    /// Select the first node matching a pre-compiled CSS selector.
412    fn select_first_compiled(
413        &self,
414        sel: &CompiledSelector,
415    ) -> Result<Option<NodeRef<'_>>, SelectorError>;
416
417    /// Select the first node matching a CSS selector.
418    fn select_first(&self, css: &str) -> Result<Option<NodeRef<'_>>, SelectorError>;
419
420    /// Find all elements with the given tag.
421    fn find_by_tag(&self, tag: Tag) -> Selection<'_>;
422
423    /// Find an element by its `id` attribute.
424    ///
425    /// Scans all nodes linearly. For repeated lookups, build a
426    /// [`DocumentIndex`] instead.
427    fn find_by_id(&self, id: &str) -> Option<NodeRef<'_>>;
428
429    /// Find all elements with the given CSS class.
430    fn find_by_class(&self, class: &str) -> Selection<'_>;
431
432    /// Find all elements with an attribute matching a value.
433    fn find_by_attr(&self, name: &str, value: &str) -> Selection<'_>;
434
435    /// Evaluate an XPath expression against the document.
436    ///
437    /// # Errors
438    ///
439    /// Returns [`XPathError::Invalid`] if the expression syntax is invalid.
440    ///
441    /// # Example
442    ///
443    /// ```
444    /// use fhp_tree::parse;
445    /// use fhp_selector::Selectable;
446    /// use fhp_selector::xpath::ast::XPathResult;
447    ///
448    /// let doc = parse("<div><p>Hello</p></div>").unwrap();
449    /// let result = doc.xpath("//p").unwrap();
450    /// match result {
451    ///     XPathResult::Nodes(nodes) => assert_eq!(nodes.len(), 1),
452    ///     _ => panic!("expected nodes"),
453    /// }
454    /// ```
455    fn xpath(&self, expr: &str) -> Result<XPathResult, XPathError>;
456}
457
458impl Selectable for Document {
459    fn select(&self, css: &str) -> Result<Selection<'_>, SelectorError> {
460        let list = parse_selector_cached(css)?;
461        let nodes = select_all_list(self.arena(), self.root_id(), &list);
462        Ok(Selection::new(self, nodes))
463    }
464
465    fn select_compiled(&self, sel: &CompiledSelector) -> Result<Selection<'_>, SelectorError> {
466        let nodes = select_all_list(self.arena(), self.root_id(), &sel.list);
467        Ok(Selection::new(self, nodes))
468    }
469
470    fn select_first_compiled(
471        &self,
472        sel: &CompiledSelector,
473    ) -> Result<Option<NodeRef<'_>>, SelectorError> {
474        let node = select_first_list(self.arena(), self.root_id(), &sel.list);
475        Ok(node.map(|id| self.get(id)))
476    }
477
478    fn select_first(&self, css: &str) -> Result<Option<NodeRef<'_>>, SelectorError> {
479        let list = parse_selector_cached(css)?;
480        let node = select_first_list(self.arena(), self.root_id(), &list);
481        Ok(node.map(|id| self.get(id)))
482    }
483
484    fn find_by_tag(&self, tag: Tag) -> Selection<'_> {
485        let arena = self.arena();
486        let mut nodes = Vec::new();
487        for i in 0..arena.len() {
488            let id = NodeId(i as u32);
489            let n = arena.get(id);
490            if n.tag == tag && is_document_element(n) {
491                nodes.push(id);
492            }
493        }
494        Selection::new(self, nodes)
495    }
496
497    fn find_by_id(&self, id: &str) -> Option<NodeRef<'_>> {
498        let arena = self.arena();
499        for i in 0..arena.len() {
500            let node_id = NodeId(i as u32);
501            let attrs = arena.attrs(node_id);
502            for attr in attrs {
503                if arena.attr_name(attr).eq_ignore_ascii_case("id")
504                    && arena.attr_value(attr) == Some(id)
505                {
506                    return Some(self.get(node_id));
507                }
508            }
509        }
510        None
511    }
512
513    fn find_by_class(&self, class: &str) -> Selection<'_> {
514        let arena = self.arena();
515        let mut nodes = Vec::new();
516        for i in 0..arena.len() {
517            let id = NodeId(i as u32);
518            let attrs = arena.attrs(id);
519            for attr in attrs {
520                if arena.attr_name(attr).eq_ignore_ascii_case("class") {
521                    if let Some(val) = arena.attr_value(attr) {
522                        if val.split_whitespace().any(|c| c == class) {
523                            nodes.push(id);
524                            break;
525                        }
526                    }
527                }
528            }
529        }
530        Selection::new(self, nodes)
531    }
532
533    fn find_by_attr(&self, name: &str, value: &str) -> Selection<'_> {
534        let arena = self.arena();
535        let mut nodes = Vec::new();
536        for i in 0..arena.len() {
537            let id = NodeId(i as u32);
538            let attrs = arena.attrs(id);
539            for attr in attrs {
540                if arena.attr_name(attr).eq_ignore_ascii_case(name)
541                    && arena.attr_value(attr) == Some(value)
542                {
543                    nodes.push(id);
544                    break;
545                }
546            }
547        }
548        Selection::new(self, nodes)
549    }
550
551    fn xpath(&self, expr: &str) -> Result<XPathResult, XPathError> {
552        let parsed = xpath::parser::parse_xpath(expr)?;
553        Ok(xpath::eval::evaluate(&parsed, self.arena(), self.root_id()))
554    }
555}
556
557/// Pre-built index for O(1) id, class, and tag lookups.
558///
559/// Build once with a single DFS pass, reuse for many lookups.
560pub struct DocumentIndex {
561    id_map: HashMap<String, NodeId>,
562    class_map: HashMap<String, Vec<NodeId>>,
563    tag_map: HashMap<Tag, Vec<NodeId>>,
564}
565
566impl DocumentIndex {
567    /// Build an index from a document by scanning all nodes in a single pass.
568    ///
569    /// If the arena has a pre-built tag index (constructed inline during tree
570    /// building), the tag map is copied directly — avoiding the tag scan.
571    pub fn build(doc: &Document) -> Self {
572        let arena = doc.arena();
573        let mut id_map = HashMap::with_capacity(arena.len() / 8);
574        let mut class_map: HashMap<String, Vec<NodeId>> = HashMap::with_capacity(arena.len() / 4);
575
576        // Use pre-built tag index if available.
577        let tag_map = if let Some(pre_built) = arena.tag_index() {
578            let mut map = HashMap::with_capacity(64);
579            for (tag_u8, ids) in pre_built.iter().enumerate() {
580                let filtered_ids: Vec<_> = ids
581                    .iter()
582                    .copied()
583                    .filter(|&id| is_document_element(arena.get(id)))
584                    .collect();
585                if !filtered_ids.is_empty() {
586                    // SAFETY: Tag is repr(u8), but not all u8 values are valid.
587                    // We only insert entries that were created via new_element with
588                    // a valid Tag, so the cast is safe (values came from Tag as u8).
589                    let tag: Tag = unsafe { std::mem::transmute(tag_u8 as u8) };
590                    map.insert(tag, filtered_ids);
591                }
592            }
593            map
594        } else {
595            let mut map: HashMap<Tag, Vec<NodeId>> = HashMap::with_capacity(64);
596            for i in 0..arena.len() {
597                let node_id = NodeId(i as u32);
598                let n = arena.get(node_id);
599                if !is_document_element(n) {
600                    continue;
601                }
602                map.entry(n.tag).or_default().push(node_id);
603            }
604            map
605        };
606
607        // Build id and class maps — still requires attribute scan.
608        for i in 0..arena.len() {
609            let node_id = NodeId(i as u32);
610            let n = arena.get(node_id);
611
612            // Skip non-element nodes.
613            if !is_document_element(n) {
614                continue;
615            }
616
617            let attrs = arena.attrs(node_id);
618            for attr in attrs {
619                let attr_name = arena.attr_name(attr);
620                if attr_name.eq_ignore_ascii_case("id") {
621                    if let Some(val) = arena.attr_value(attr) {
622                        if let Some(existing) = id_map.get_mut(val) {
623                            *existing = node_id;
624                        } else {
625                            id_map.insert(val.to_owned(), node_id);
626                        }
627                    }
628                }
629                if attr_name.eq_ignore_ascii_case("class") {
630                    if let Some(val) = arena.attr_value(attr) {
631                        for class in val.split_whitespace() {
632                            if let Some(ids) = class_map.get_mut(class) {
633                                ids.push(node_id);
634                            } else {
635                                class_map.insert(class.to_owned(), vec![node_id]);
636                            }
637                        }
638                    }
639                }
640            }
641        }
642
643        Self {
644            id_map,
645            class_map,
646            tag_map,
647        }
648    }
649
650    /// Look up a node by its `id` attribute in O(1).
651    pub fn find_by_id<'a>(&self, doc: &'a Document, id: &str) -> Option<NodeRef<'a>> {
652        self.id_map.get(id).map(|&node_id| doc.get(node_id))
653    }
654
655    /// Look up all nodes with a given CSS class in O(1).
656    pub fn find_by_class<'a>(&self, doc: &'a Document, class: &str) -> Vec<NodeRef<'a>> {
657        self.class_map
658            .get(class)
659            .map(|ids| ids.iter().map(|&id| doc.get(id)).collect())
660            .unwrap_or_default()
661    }
662
663    /// Look up all nodes with a given tag in O(1).
664    pub fn find_by_tag<'a>(&self, doc: &'a Document, tag: Tag) -> Vec<NodeRef<'a>> {
665        self.tag_map
666            .get(&tag)
667            .map(|ids| ids.iter().map(|&id| doc.get(id)).collect())
668            .unwrap_or_default()
669    }
670}
671
#[cfg(test)]
mod tests {
    use super::*;
    use fhp_tree::parse;

    // --- CSS selection on a document -------------------------------------

    #[test]
    fn select_basic() {
        let doc = parse("<div><p>Hello</p></div>").unwrap();
        let paragraphs = doc.select("p").unwrap();
        assert_eq!(paragraphs.len(), 1);
        assert_eq!(paragraphs.text(), "Hello");
    }

    #[test]
    fn select_first_found() {
        let doc = parse("<div><p>a</p><p>b</p></div>").unwrap();
        let hit = doc.select_first("p").unwrap();
        assert!(hit.is_some());
        assert_eq!(hit.unwrap().text_content(), "a");
    }

    #[test]
    fn select_chaining() {
        let doc = parse("<ul><li><a>1</a></li><li><a>2</a></li></ul>").unwrap();
        let items = doc.select("li").unwrap();
        assert_eq!(items.len(), 2);
        let anchors = items.select("a").unwrap();
        assert_eq!(anchors.len(), 2);
        assert_eq!(anchors.text(), "12");
    }

    // --- Linear-scan convenience lookups ----------------------------------

    #[test]
    fn find_by_tag_works() {
        let doc = parse("<div><span>a</span><span>b</span></div>").unwrap();
        let spans = doc.find_by_tag(Tag::Span);
        assert_eq!(spans.len(), 2);
    }

    #[test]
    fn find_by_id_works() {
        let doc = parse("<div id=\"main\">x</div><div>y</div>").unwrap();
        let found = doc.find_by_id("main");
        assert!(found.is_some());
        assert_eq!(found.unwrap().text_content(), "x");
    }

    #[test]
    fn find_by_id_missing() {
        let doc = parse("<div>x</div>").unwrap();
        let found = doc.find_by_id("nope");
        assert!(found.is_none());
    }

    #[test]
    fn find_by_class_works() {
        let doc = parse("<div class=\"a b\">x</div><div class=\"c\">y</div>").unwrap();
        let matched = doc.find_by_class("a");
        assert_eq!(matched.len(), 1);
        assert_eq!(matched.text(), "x");
    }

    #[test]
    fn find_by_attr_works() {
        let doc = parse("<a href=\"x\">a</a><a href=\"y\">b</a>").unwrap();
        let matched = doc.find_by_attr("href", "x");
        assert_eq!(matched.len(), 1);
        assert_eq!(matched.text(), "a");
    }

    // --- Selection accessors ----------------------------------------------

    #[test]
    fn selection_attr() {
        let doc = parse("<a href=\"url\">link</a>").unwrap();
        let anchors = doc.select("a").unwrap();
        assert_eq!(anchors.attr("href"), Some("url"));
    }

    #[test]
    fn selection_inner_html() {
        let doc = parse("<div><p>Hello</p></div>").unwrap();
        let divs = doc.select("div").unwrap();
        assert_eq!(divs.inner_html(), "<p>Hello</p>");
    }

    #[test]
    fn selection_empty() {
        let doc = parse("<div>x</div>").unwrap();
        let none = doc.select("span").unwrap();
        assert!(none.is_empty());
        assert_eq!(none.len(), 0);
        assert!(none.first().is_none());
    }

    // --- DocumentIndex -----------------------------------------------------

    #[test]
    fn document_index_o1() {
        let doc = parse("<div id=\"a\">x</div><div id=\"b\">y</div>").unwrap();
        let index = DocumentIndex::build(&doc);
        let found = index.find_by_id(&doc, "b").unwrap();
        assert_eq!(found.text_content(), "y");
    }

    #[test]
    fn document_index_find_by_class() {
        let doc = parse("<div class=\"a b\">x</div><span class=\"b c\">y</span><p>z</p>").unwrap();
        let index = DocumentIndex::build(&doc);

        let with_b = index.find_by_class(&doc, "b");
        assert_eq!(with_b.len(), 2);

        let with_a = index.find_by_class(&doc, "a");
        assert_eq!(with_a.len(), 1);
        assert_eq!(with_a[0].text_content(), "x");

        let with_unknown = index.find_by_class(&doc, "nope");
        assert!(with_unknown.is_empty());
    }

    #[test]
    fn document_index_find_by_tag() {
        let doc = parse("<div>a</div><div>b</div><span>c</span>").unwrap();
        let index = DocumentIndex::build(&doc);

        let div_nodes = index.find_by_tag(&doc, Tag::Div);
        assert_eq!(div_nodes.len(), 2);

        let span_nodes = index.find_by_tag(&doc, Tag::Span);
        assert_eq!(span_nodes.len(), 1);
        assert_eq!(span_nodes[0].text_content(), "c");

        let anchor_nodes = index.find_by_tag(&doc, Tag::A);
        assert!(anchor_nodes.is_empty());
    }

    #[test]
    fn document_index_handles_uppercase_html_attributes() {
        let doc = parse("<div ID=\"main\" CLASS=\"active hero\">x</div>").unwrap();
        let index = DocumentIndex::build(&doc);

        let by_id = index.find_by_id(&doc, "main").unwrap();
        assert_eq!(by_id.text_content(), "x");
        assert_eq!(index.find_by_class(&doc, "active").len(), 1);
        assert_eq!(index.find_by_class(&doc, "hero").len(), 1);
    }

    // --- Iteration -----------------------------------------------------------

    #[test]
    fn selection_into_iter() {
        let doc = parse("<div><p>a</p><p>b</p></div>").unwrap();
        let paragraphs = doc.select("p").unwrap();
        let mut texts: Vec<String> = Vec::new();
        for node in &paragraphs {
            texts.push(node.text_content());
        }
        assert_eq!(texts, vec!["a", "b"]);
    }

    // --- XPath -----------------------------------------------------------------

    #[test]
    fn xpath_descendant() {
        let doc = parse("<div><p>Hello</p></div>").unwrap();
        if let XPathResult::Nodes(nodes) = doc.xpath("//p").unwrap() {
            assert_eq!(nodes.len(), 1);
        } else {
            panic!("expected Nodes");
        }
    }

    #[test]
    fn xpath_text_extract() {
        let doc = parse("<div><p>Hello</p></div>").unwrap();
        if let XPathResult::Strings(texts) = doc.xpath("//p/text()").unwrap() {
            assert_eq!(texts.len(), 1);
            assert_eq!(texts[0], "Hello");
        } else {
            panic!("expected Strings");
        }
    }

    #[test]
    fn xpath_invalid() {
        let doc = parse("<div>x</div>").unwrap();
        for bad_expr in ["", "bad"] {
            assert!(doc.xpath(bad_expr).is_err());
        }
    }

    #[test]
    fn selection_xpath_chaining() {
        let doc = parse("<ul><li>1</li><li>2</li></ul><ol><li>3</li></ol>").unwrap();
        let lists = doc.select("ul").unwrap();
        if let XPathResult::Nodes(nodes) = lists.xpath("//li").unwrap() {
            assert_eq!(nodes.len(), 2);
        } else {
            panic!("expected Nodes");
        }
    }

    // --- CompiledSelector ----------------------------------------------------

    #[test]
    fn compiled_selector_basic() {
        let compiled = CompiledSelector::new("p").unwrap();
        let doc = parse("<div><p>Hello</p></div>").unwrap();
        let matched = doc.select_compiled(&compiled).unwrap();
        assert_eq!(matched.len(), 1);
        assert_eq!(matched.text(), "Hello");
    }

    #[test]
    fn compiled_selector_first() {
        let compiled = CompiledSelector::new("p").unwrap();
        let doc = parse("<div><p>a</p><p>b</p></div>").unwrap();
        let hit = doc.select_first_compiled(&compiled).unwrap();
        assert!(hit.is_some());
        assert_eq!(hit.unwrap().text_content(), "a");
    }

    #[test]
    fn compiled_selector_reuse_across_docs() {
        let compiled = CompiledSelector::new("span.active").unwrap();
        let flat = parse("<span class=\"active\">one</span>").unwrap();
        let nested = parse("<div><span class=\"active\">two</span></div>").unwrap();
        assert_eq!(flat.select_compiled(&compiled).unwrap().text(), "one");
        assert_eq!(nested.select_compiled(&compiled).unwrap().text(), "two");
    }

    #[test]
    fn compiled_selector_chaining() {
        let compiled = CompiledSelector::new("a").unwrap();
        let doc = parse("<ul><li><a>1</a></li><li><a>2</a></li></ul>").unwrap();
        let items = doc.select("li").unwrap();
        let anchors = items.select_compiled(&compiled).unwrap();
        assert_eq!(anchors.len(), 2);
    }

    #[test]
    fn compiled_selector_invalid() {
        let outcome = CompiledSelector::new("");
        assert!(outcome.is_err());
    }

    #[test]
    fn compiled_selector_clone() {
        let original = CompiledSelector::new("div").unwrap();
        let copy = original.clone();
        let doc = parse("<div>ok</div>").unwrap();
        assert_eq!(doc.select_compiled(&copy).unwrap().len(), 1);
    }
}