Skip to main content

scrape_core/dom/
document.rs

1//! Document container and tree operations.
2
3use std::{collections::HashMap, marker::PhantomData};
4
5use super::{
6    arena::Arena,
7    index::DocumentIndex,
8    node::{Node, NodeId, NodeKind},
9    state::{Building, DocumentState, MutableState, Queryable, QueryableState, Sealed},
10};
11
12/// An HTML document containing a tree of nodes.
13///
14/// The document owns all nodes via an arena allocator, ensuring
15/// cache-friendly contiguous storage. Navigation is performed
16/// using [`NodeId`] handles.
17///
18/// # Architecture
19///
20/// Nodes are stored in a single contiguous `Arena<Node>`. Parent-child
21/// relationships use `first_child`/`last_child` links (O(1) append),
22/// and siblings are doubly linked via `prev_sibling`/`next_sibling`.
23///
24/// # Navigation
25///
26/// The document provides both direct navigation methods and lazy iterators:
27///
28/// - [`parent`](Document::parent), [`first_child`](Document::first_child),
29///   [`last_child`](Document::last_child) - direct links
30/// - [`children`](Document::children) - iterate over direct children
31/// - [`ancestors`](Document::ancestors) - iterate from parent to root
32/// - [`descendants`](Document::descendants) - depth-first subtree traversal
33///
34/// # Typestate
35///
36/// Internally, the document uses typestate pattern to enforce lifecycle
37/// guarantees at compile time. The type parameter `S` tracks whether
38/// the document is being built, is queryable, or is sealed.
39#[derive(Debug)]
40pub struct DocumentImpl<S: DocumentState = Queryable> {
41    arena: Arena<Node>,
42    root: Option<NodeId>,
43    index: Option<DocumentIndex>,
44    _state: PhantomData<S>,
45}
46
47/// Public alias for backward compatibility.
48///
49/// The public `Document` type always refers to a queryable document.
50/// Internally, we use `DocumentImpl<S>` for typestate enforcement.
51pub type Document = DocumentImpl<Queryable>;
52
53// ==================== Default Implementations ====================
54
55impl Default for DocumentImpl<Building> {
56    fn default() -> Self {
57        Self::new()
58    }
59}
60
61impl Default for Document {
62    fn default() -> Self {
63        DocumentImpl::<Building>::new().build()
64    }
65}
66
67// ==================== Building State ====================
68
69impl DocumentImpl<Building> {
70    /// Creates a new empty document in building state.
71    ///
72    /// The default capacity is 256 nodes, which is sufficient for typical HTML pages
73    /// and reduces reallocations during parsing.
74    #[must_use]
75    pub fn new() -> Self {
76        Self::with_capacity(256)
77    }
78
79    /// Creates a new empty document with the specified capacity.
80    ///
81    /// Use this when you know the approximate number of nodes to avoid reallocations.
82    #[must_use]
83    pub fn with_capacity(capacity: usize) -> Self {
84        Self { arena: Arena::with_capacity(capacity), root: None, index: None, _state: PhantomData }
85    }
86
87    /// Sets the root node ID.
88    pub fn set_root(&mut self, id: NodeId) {
89        self.root = Some(id);
90    }
91
92    /// Creates a new element node and returns its ID.
93    pub fn create_element(
94        &mut self,
95        name: impl Into<String>,
96        attributes: HashMap<String, String>,
97    ) -> NodeId {
98        NodeId::new(self.arena.alloc(Node::element(name, attributes)))
99    }
100
101    /// Creates a new text node and returns its ID.
102    pub fn create_text(&mut self, content: impl Into<String>) -> NodeId {
103        NodeId::new(self.arena.alloc(Node::text(content)))
104    }
105
106    /// Creates a new comment node and returns its ID.
107    pub fn create_comment(&mut self, content: impl Into<String>) -> NodeId {
108        NodeId::new(self.arena.alloc(Node::comment(content)))
109    }
110
111    /// Appends a child node to a parent.
112    ///
113    /// Updates parent, `first_child`, `last_child`, and sibling links.
114    ///
115    /// # Panics
116    ///
117    /// Panics in debug builds if `parent_id` or `child_id` are invalid.
118    pub fn append_child(&mut self, parent_id: NodeId, child_id: NodeId) {
119        debug_assert!(parent_id.index() < self.arena.len(), "Invalid parent_id");
120        debug_assert!(child_id.index() < self.arena.len(), "Invalid child_id");
121
122        // Get current last child of parent
123        let prev_last = self.arena.get(parent_id.index()).and_then(|p| p.last_child);
124
125        // Update child's parent and prev_sibling
126        if let Some(child) = self.arena.get_mut(child_id.index()) {
127            child.parent = Some(parent_id);
128            child.prev_sibling = prev_last;
129            child.next_sibling = None;
130        }
131
132        // Update previous last child's next_sibling
133        if let Some(prev_id) = prev_last
134            && let Some(prev) = self.arena.get_mut(prev_id.index())
135        {
136            prev.next_sibling = Some(child_id);
137        }
138
139        // Update parent's first_child (if first) and last_child
140        if let Some(parent) = self.arena.get_mut(parent_id.index()) {
141            if parent.first_child.is_none() {
142                parent.first_child = Some(child_id);
143            }
144            parent.last_child = Some(child_id);
145        }
146    }
147
148    /// Inserts `new_child` immediately before `sibling` in the parent's child list.
149    ///
150    /// `sibling` must have a parent; `new_child` must not already be in the tree.
151    pub fn insert_before(&mut self, sibling: NodeId, new_child: NodeId) {
152        let Some(parent) = self.arena.get(sibling.index()).and_then(|n| n.parent) else {
153            return;
154        };
155
156        let prev = self.arena.get(sibling.index()).and_then(|n| n.prev_sibling);
157
158        // Wire new_child into the chain
159        if let Some(node) = self.arena.get_mut(new_child.index()) {
160            node.parent = Some(parent);
161            node.prev_sibling = prev;
162            node.next_sibling = Some(sibling);
163        }
164
165        // Update previous sibling's next pointer (or parent's first_child)
166        if let Some(prev_id) = prev {
167            if let Some(prev_node) = self.arena.get_mut(prev_id.index()) {
168                prev_node.next_sibling = Some(new_child);
169            }
170        } else if let Some(parent_node) = self.arena.get_mut(parent.index()) {
171            parent_node.first_child = Some(new_child);
172        }
173
174        // Update sibling's prev pointer
175        if let Some(sib) = self.arena.get_mut(sibling.index()) {
176            sib.prev_sibling = Some(new_child);
177        }
178    }
179
180    /// Detaches `target` from its parent, fixing up sibling and parent links.
181    pub fn remove_from_parent(&mut self, target: NodeId) {
182        let Some(parent) = self.arena.get(target.index()).and_then(|n| n.parent) else {
183            return;
184        };
185        let prev = self.arena.get(target.index()).and_then(|n| n.prev_sibling);
186        let next = self.arena.get(target.index()).and_then(|n| n.next_sibling);
187
188        // Bridge prev <-> next
189        if let Some(prev_id) = prev {
190            if let Some(prev_node) = self.arena.get_mut(prev_id.index()) {
191                prev_node.next_sibling = next;
192            }
193        } else if let Some(parent_node) = self.arena.get_mut(parent.index()) {
194            parent_node.first_child = next;
195        }
196
197        if let Some(next_id) = next {
198            if let Some(next_node) = self.arena.get_mut(next_id.index()) {
199                next_node.prev_sibling = prev;
200            }
201        } else if let Some(parent_node) = self.arena.get_mut(parent.index()) {
202            parent_node.last_child = prev;
203        }
204
205        // Detach target
206        if let Some(node) = self.arena.get_mut(target.index()) {
207            node.parent = None;
208            node.prev_sibling = None;
209            node.next_sibling = None;
210        }
211    }
212
213    /// Moves all children of `src` to become the last children of `dst`.
214    pub fn reparent_children(&mut self, src: NodeId, dst: NodeId) {
215        // Collect children of src first to avoid borrow conflicts
216        let mut child = self.arena.get(src.index()).and_then(|n| n.first_child);
217        while let Some(child_id) = child {
218            let next = self.arena.get(child_id.index()).and_then(|n| n.next_sibling);
219            // Disconnect from src chain first
220            if let Some(node) = self.arena.get_mut(child_id.index()) {
221                node.parent = None;
222                node.prev_sibling = None;
223                node.next_sibling = None;
224            }
225            self.append_child(dst, child_id);
226            child = next;
227        }
228        // Clear src's child pointers
229        if let Some(node) = self.arena.get_mut(src.index()) {
230            node.first_child = None;
231            node.last_child = None;
232        }
233    }
234
235    /// Appends text to the last child of `parent` if it is a text node;
236    /// returns `true` when the text was merged, `false` when a new node is needed.
237    pub fn try_append_text_to_last_child(&mut self, parent: NodeId, text: &str) -> bool {
238        let Some(last) = self.arena.get(parent.index()).and_then(|n| n.last_child) else {
239            return false;
240        };
241        if let Some(node) = self.arena.get_mut(last.index())
242            && let NodeKind::Text { content } = &mut node.kind
243        {
244            content.push_str(text);
245            return true;
246        }
247        false
248    }
249
250    /// Appends text to `node` itself if it is a text node;
251    /// returns `true` when the text was merged, `false` when a new node is needed.
252    pub fn try_append_text_to_node(&mut self, node: NodeId, text: &str) -> bool {
253        if let Some(n) = self.arena.get_mut(node.index())
254            && let NodeKind::Text { content } = &mut n.kind
255        {
256            content.push_str(text);
257            return true;
258        }
259        false
260    }
261
262    /// Transitions the document from Building to Queryable state.
263    ///
264    /// This is a one-way transition. Once built, the document structure
265    /// cannot be modified.
266    ///
267    /// # Examples
268    ///
269    /// ```rust
270    /// use std::collections::HashMap;
271    ///
272    /// use scrape_core::{Building, DocumentImpl};
273    ///
274    /// let mut doc = DocumentImpl::<Building>::new();
275    /// let root = doc.create_element("div", HashMap::new());
276    /// doc.set_root(root);
277    ///
278    /// // Transition to queryable state
279    /// let doc = doc.build();
280    /// ```
281    #[must_use]
282    pub fn build(self) -> DocumentImpl<Queryable> {
283        DocumentImpl { arena: self.arena, root: self.root, index: self.index, _state: PhantomData }
284    }
285}
286
287// ==================== Queryable State ====================
288
289impl DocumentImpl<Queryable> {
290    /// Creates a new empty document in queryable state.
291    ///
292    /// This is a convenience method for backward compatibility.
293    /// Internally, it creates a Building document and immediately transitions to Queryable.
294    #[must_use]
295    pub fn new() -> Self {
296        DocumentImpl::<Building>::new().build()
297    }
298
299    /// Creates a new empty document with the specified capacity in queryable state.
300    ///
301    /// This is a convenience method for backward compatibility.
302    #[must_use]
303    pub fn with_capacity(capacity: usize) -> Self {
304        DocumentImpl::<Building>::with_capacity(capacity).build()
305    }
306
307    /// Sets the root node ID.
308    ///
309    /// Available on Queryable for backward compatibility with tests.
310    pub fn set_root(&mut self, id: NodeId) {
311        self.root = Some(id);
312    }
313
314    /// Creates a new element node and returns its ID.
315    ///
316    /// Available on Queryable for backward compatibility with tests.
317    pub fn create_element(
318        &mut self,
319        name: impl Into<String>,
320        attributes: HashMap<String, String>,
321    ) -> NodeId {
322        NodeId::new(self.arena.alloc(Node::element(name, attributes)))
323    }
324
325    /// Creates a new text node and returns its ID.
326    ///
327    /// Available on Queryable for backward compatibility with tests.
328    pub fn create_text(&mut self, content: impl Into<String>) -> NodeId {
329        NodeId::new(self.arena.alloc(Node::text(content)))
330    }
331
332    /// Creates a new comment node and returns its ID.
333    ///
334    /// Available on Queryable for backward compatibility with tests.
335    pub fn create_comment(&mut self, content: impl Into<String>) -> NodeId {
336        NodeId::new(self.arena.alloc(Node::comment(content)))
337    }
338
339    /// Appends a child node to a parent.
340    ///
341    /// Available on Queryable for backward compatibility with tests.
342    ///
343    /// Updates parent, `first_child`, `last_child`, and sibling links.
344    ///
345    /// # Panics
346    ///
347    /// Panics in debug builds if `parent_id` or `child_id` are invalid.
348    pub fn append_child(&mut self, parent_id: NodeId, child_id: NodeId) {
349        debug_assert!(parent_id.index() < self.arena.len(), "Invalid parent_id");
350        debug_assert!(child_id.index() < self.arena.len(), "Invalid child_id");
351
352        // Get current last child of parent
353        let prev_last = self.arena.get(parent_id.index()).and_then(|p| p.last_child);
354
355        // Update child's parent and prev_sibling
356        if let Some(child) = self.arena.get_mut(child_id.index()) {
357            child.parent = Some(parent_id);
358            child.prev_sibling = prev_last;
359            child.next_sibling = None;
360        }
361
362        // Update previous last child's next_sibling
363        if let Some(prev_id) = prev_last
364            && let Some(prev) = self.arena.get_mut(prev_id.index())
365        {
366            prev.next_sibling = Some(child_id);
367        }
368
369        // Update parent's first_child (if first) and last_child
370        if let Some(parent) = self.arena.get_mut(parent_id.index()) {
371            if parent.first_child.is_none() {
372                parent.first_child = Some(child_id);
373            }
374            parent.last_child = Some(child_id);
375        }
376    }
377
378    /// Seals the document, preventing any future modifications.
379    ///
380    /// This is a one-way transition for when you need to guarantee
381    /// the document will never change.
382    #[must_use]
383    pub fn seal(self) -> DocumentImpl<Sealed> {
384        DocumentImpl { arena: self.arena, root: self.root, index: self.index, _state: PhantomData }
385    }
386
387    /// Sets the document index.
388    ///
389    /// Only available in Queryable state since index would be invalidated
390    /// by structural modifications.
391    pub fn set_index(&mut self, index: DocumentIndex) {
392        self.index = Some(index);
393    }
394}
395
396// ==================== Shared Methods (All States) ====================
397
398impl<S: DocumentState> DocumentImpl<S> {
399    /// Returns the root node ID, if any.
400    #[must_use]
401    pub fn root(&self) -> Option<NodeId> {
402        self.root
403    }
404
405    /// Returns a reference to the node with the given ID.
406    #[inline]
407    #[must_use]
408    pub fn get(&self, id: NodeId) -> Option<&Node> {
409        self.arena.get(id.index())
410    }
411
412    /// Returns the number of nodes in the document.
413    #[must_use]
414    pub fn len(&self) -> usize {
415        self.arena.len()
416    }
417
418    /// Returns `true` if the document has no nodes.
419    #[must_use]
420    pub fn is_empty(&self) -> bool {
421        self.arena.is_empty()
422    }
423
424    /// Returns an iterator over all nodes.
425    pub fn nodes(&self) -> impl Iterator<Item = (NodeId, &Node)> {
426        self.arena.iter().map(|(i, node)| (NodeId::new(i), node))
427    }
428}
429
430// ==================== Mutable State Methods ====================
431
432impl<S: MutableState> DocumentImpl<S> {
433    /// Returns a mutable reference to the node with the given ID.
434    ///
435    /// Only available for documents in mutable states (Building).
436    #[inline]
437    #[must_use]
438    pub fn get_mut(&mut self, id: NodeId) -> Option<&mut Node> {
439        self.arena.get_mut(id.index())
440    }
441}
442
443// ==================== Queryable State Methods ====================
444
445impl<S: QueryableState> DocumentImpl<S> {
446    /// Returns the document index, if built.
447    ///
448    /// Only available in queryable states (Queryable, Sealed).
449    #[must_use]
450    pub fn index(&self) -> Option<&DocumentIndex> {
451        self.index.as_ref()
452    }
453}
454
455// ==================== Navigation APIs ====================
456
457impl<S: DocumentState> DocumentImpl<S> {
458    // ==================== Navigation APIs ====================
459
460    /// Returns the parent of a node.
461    #[inline]
462    #[must_use]
463    pub fn parent(&self, id: NodeId) -> Option<NodeId> {
464        self.arena.get(id.index()).and_then(|n| n.parent)
465    }
466
467    /// Returns the first child of a node.
468    #[inline]
469    #[must_use]
470    pub fn first_child(&self, id: NodeId) -> Option<NodeId> {
471        self.arena.get(id.index()).and_then(|n| n.first_child)
472    }
473
474    /// Returns the last child of a node.
475    #[inline]
476    #[must_use]
477    pub fn last_child(&self, id: NodeId) -> Option<NodeId> {
478        self.arena.get(id.index()).and_then(|n| n.last_child)
479    }
480
481    /// Returns the next sibling of a node.
482    #[inline]
483    #[must_use]
484    pub fn next_sibling(&self, id: NodeId) -> Option<NodeId> {
485        self.arena.get(id.index()).and_then(|n| n.next_sibling)
486    }
487
488    /// Returns the previous sibling of a node.
489    #[inline]
490    #[must_use]
491    pub fn prev_sibling(&self, id: NodeId) -> Option<NodeId> {
492        self.arena.get(id.index()).and_then(|n| n.prev_sibling)
493    }
494
495    /// Returns an iterator over children of a node.
496    ///
497    /// The iterator yields children in order from first to last.
498    ///
499    /// # Examples
500    ///
501    /// ```
502    /// use std::collections::HashMap;
503    ///
504    /// use scrape_core::Document;
505    ///
506    /// let mut doc = Document::new();
507    /// let parent = doc.create_element("div", HashMap::new());
508    /// let child1 = doc.create_element("span", HashMap::new());
509    /// let child2 = doc.create_element("span", HashMap::new());
510    ///
511    /// doc.append_child(parent, child1);
512    /// doc.append_child(parent, child2);
513    ///
514    /// let children: Vec<_> = doc.children(parent).collect();
515    /// assert_eq!(children.len(), 2);
516    /// ```
517    #[must_use]
518    pub fn children(&self, id: NodeId) -> ChildrenIter<'_, S> {
519        ChildrenIter { doc: self, current: self.first_child(id) }
520    }
521
522    /// Returns an iterator over ancestors of a node.
523    ///
524    /// The iterator yields ancestors from parent to root (does not include the node itself).
525    ///
526    /// # Examples
527    ///
528    /// ```
529    /// use std::collections::HashMap;
530    ///
531    /// use scrape_core::Document;
532    ///
533    /// let mut doc = Document::new();
534    /// let grandparent = doc.create_element("html", HashMap::new());
535    /// let parent = doc.create_element("body", HashMap::new());
536    /// let child = doc.create_element("div", HashMap::new());
537    ///
538    /// doc.append_child(grandparent, parent);
539    /// doc.append_child(parent, child);
540    ///
541    /// let ancestors: Vec<_> = doc.ancestors(child).collect();
542    /// assert_eq!(ancestors.len(), 2); // parent, grandparent
543    /// ```
544    #[must_use]
545    pub fn ancestors(&self, id: NodeId) -> AncestorsIter<'_, S> {
546        AncestorsIter { doc: self, current: self.parent(id) }
547    }
548
549    /// Returns an iterator over descendants in depth-first pre-order.
550    ///
551    /// Does not include the starting node itself.
552    ///
553    /// # Examples
554    ///
555    /// ```
556    /// use std::collections::HashMap;
557    ///
558    /// use scrape_core::Document;
559    ///
560    /// let mut doc = Document::new();
561    /// let root = doc.create_element("html", HashMap::new());
562    /// let child1 = doc.create_element("head", HashMap::new());
563    /// let child2 = doc.create_element("body", HashMap::new());
564    /// let grandchild = doc.create_element("div", HashMap::new());
565    ///
566    /// doc.append_child(root, child1);
567    /// doc.append_child(root, child2);
568    /// doc.append_child(child2, grandchild);
569    ///
570    /// let descendants: Vec<_> = doc.descendants(root).collect();
571    /// assert_eq!(descendants.len(), 3); // head, body, div
572    /// ```
573    #[must_use]
574    pub fn descendants(&self, id: NodeId) -> DescendantsIter<'_, S> {
575        DescendantsIter { doc: self, root: id, stack: vec![id], started: false }
576    }
577
578    /// Returns an iterator over siblings following a node.
579    ///
580    /// Does not include the node itself.
581    ///
582    /// # Examples
583    ///
584    /// ```
585    /// use std::collections::HashMap;
586    ///
587    /// use scrape_core::Document;
588    ///
589    /// let mut doc = Document::new();
590    /// let parent = doc.create_element("ul", HashMap::new());
591    /// let child1 = doc.create_element("li", HashMap::new());
592    /// let child2 = doc.create_element("li", HashMap::new());
593    /// let child3 = doc.create_element("li", HashMap::new());
594    ///
595    /// doc.append_child(parent, child1);
596    /// doc.append_child(parent, child2);
597    /// doc.append_child(parent, child3);
598    ///
599    /// let next: Vec<_> = doc.next_siblings(child1).collect();
600    /// assert_eq!(next.len(), 2); // child2, child3
601    /// ```
602    #[must_use]
603    pub fn next_siblings(&self, id: NodeId) -> NextSiblingsIter<'_, S> {
604        NextSiblingsIter { doc: self, current: self.next_sibling(id) }
605    }
606
607    /// Returns an iterator over siblings preceding a node.
608    ///
609    /// Does not include the node itself. Iterates in reverse order
610    /// (from immediate predecessor toward first sibling).
611    ///
612    /// # Examples
613    ///
614    /// ```
615    /// use std::collections::HashMap;
616    ///
617    /// use scrape_core::Document;
618    ///
619    /// let mut doc = Document::new();
620    /// let parent = doc.create_element("ul", HashMap::new());
621    /// let child1 = doc.create_element("li", HashMap::new());
622    /// let child2 = doc.create_element("li", HashMap::new());
623    /// let child3 = doc.create_element("li", HashMap::new());
624    ///
625    /// doc.append_child(parent, child1);
626    /// doc.append_child(parent, child2);
627    /// doc.append_child(parent, child3);
628    ///
629    /// let prev: Vec<_> = doc.prev_siblings(child3).collect();
630    /// assert_eq!(prev.len(), 2); // child2, child1 (reverse order)
631    /// ```
632    #[must_use]
633    pub fn prev_siblings(&self, id: NodeId) -> PrevSiblingsIter<'_, S> {
634        PrevSiblingsIter { doc: self, current: self.prev_sibling(id) }
635    }
636
637    /// Returns an iterator over all siblings of a node (excluding the node itself).
638    ///
639    /// Iterates in document order from first sibling to last.
640    ///
641    /// # Examples
642    ///
643    /// ```
644    /// use std::collections::HashMap;
645    ///
646    /// use scrape_core::Document;
647    ///
648    /// let mut doc = Document::new();
649    /// let parent = doc.create_element("ul", HashMap::new());
650    /// let child1 = doc.create_element("li", HashMap::new());
651    /// let child2 = doc.create_element("li", HashMap::new());
652    /// let child3 = doc.create_element("li", HashMap::new());
653    ///
654    /// doc.append_child(parent, child1);
655    /// doc.append_child(parent, child2);
656    /// doc.append_child(parent, child3);
657    ///
658    /// let siblings: Vec<_> = doc.siblings(child2).collect();
659    /// assert_eq!(siblings.len(), 2); // child1, child3
660    /// ```
661    #[must_use]
662    pub fn siblings(&self, id: NodeId) -> SiblingsIter<'_, S> {
663        // Find first sibling by going to parent's first child
664        let first = self.parent(id).and_then(|p| self.first_child(p));
665        SiblingsIter { doc: self, current: first, exclude: id }
666    }
667}
668
669/// Iterator over direct children of a node.
670///
671/// Created by [`Document::children`].
672#[derive(Debug)]
673pub struct ChildrenIter<'a, S: DocumentState = Queryable> {
674    doc: &'a DocumentImpl<S>,
675    current: Option<NodeId>,
676}
677
678impl<S: DocumentState> Iterator for ChildrenIter<'_, S> {
679    type Item = NodeId;
680
681    fn next(&mut self) -> Option<Self::Item> {
682        let current = self.current?;
683        self.current = self.doc.next_sibling(current);
684        Some(current)
685    }
686}
687
688/// Iterator over ancestors of a node (parent, grandparent, ...).
689///
690/// Created by [`Document::ancestors`].
691#[derive(Debug)]
692pub struct AncestorsIter<'a, S: DocumentState = Queryable> {
693    doc: &'a DocumentImpl<S>,
694    current: Option<NodeId>,
695}
696
697impl<S: DocumentState> Iterator for AncestorsIter<'_, S> {
698    type Item = NodeId;
699
700    fn next(&mut self) -> Option<Self::Item> {
701        let current = self.current?;
702        self.current = self.doc.parent(current);
703        Some(current)
704    }
705}
706
707/// Iterator over descendants in depth-first pre-order.
708///
709/// Created by [`Document::descendants`].
710#[derive(Debug)]
711pub struct DescendantsIter<'a, S: DocumentState = Queryable> {
712    doc: &'a DocumentImpl<S>,
713    root: NodeId,
714    stack: Vec<NodeId>,
715    started: bool,
716}
717
718impl<S: DocumentState> Iterator for DescendantsIter<'_, S> {
719    type Item = NodeId;
720
721    fn next(&mut self) -> Option<Self::Item> {
722        if !self.started {
723            self.started = true;
724            if let Some(first) = self.doc.first_child(self.root) {
725                self.stack.clear();
726                self.stack.push(first);
727            } else {
728                return None;
729            }
730        }
731
732        let current = self.stack.pop()?;
733
734        // Push next sibling first (so it's processed after children)
735        if let Some(next) = self.doc.next_sibling(current) {
736            self.stack.push(next);
737        }
738
739        // Push first child (will be processed next - depth-first)
740        if let Some(child) = self.doc.first_child(current) {
741            self.stack.push(child);
742        }
743
744        Some(current)
745    }
746}
747
748/// Iterator over siblings following a node.
749///
750/// Created by [`Document::next_siblings`].
751#[derive(Debug)]
752pub struct NextSiblingsIter<'a, S: DocumentState = Queryable> {
753    doc: &'a DocumentImpl<S>,
754    current: Option<NodeId>,
755}
756
757impl<S: DocumentState> Iterator for NextSiblingsIter<'_, S> {
758    type Item = NodeId;
759
760    fn next(&mut self) -> Option<Self::Item> {
761        let current = self.current?;
762        self.current = self.doc.next_sibling(current);
763        Some(current)
764    }
765}
766
767/// Iterator over siblings preceding a node.
768///
769/// Created by [`Document::prev_siblings`].
770#[derive(Debug)]
771pub struct PrevSiblingsIter<'a, S: DocumentState = Queryable> {
772    doc: &'a DocumentImpl<S>,
773    current: Option<NodeId>,
774}
775
776impl<S: DocumentState> Iterator for PrevSiblingsIter<'_, S> {
777    type Item = NodeId;
778
779    fn next(&mut self) -> Option<Self::Item> {
780        let current = self.current?;
781        self.current = self.doc.prev_sibling(current);
782        Some(current)
783    }
784}
785
786/// Iterator over all siblings of a node (excluding the node itself).
787///
788/// Created by [`Document::siblings`].
789/// Iterates in document order (from first sibling to last).
790#[derive(Debug)]
791pub struct SiblingsIter<'a, S: DocumentState = Queryable> {
792    doc: &'a DocumentImpl<S>,
793    current: Option<NodeId>,
794    exclude: NodeId,
795}
796
797impl<S: DocumentState> Iterator for SiblingsIter<'_, S> {
798    type Item = NodeId;
799
800    fn next(&mut self) -> Option<Self::Item> {
801        loop {
802            let current = self.current?;
803            self.current = self.doc.next_sibling(current);
804            if current != self.exclude {
805                return Some(current);
806            }
807        }
808    }
809}
810
811// ==================== Iterator Extensions ====================
812
813impl<'a, S: DocumentState> ChildrenIter<'a, S> {
814    /// Returns an iterator over only element children (skipping text/comment nodes).
815    ///
816    /// This is more efficient than using `.filter()` externally because
817    /// it avoids closure overhead and can be specialized.
818    ///
819    /// # Examples
820    ///
821    /// ```rust
822    /// use scrape_core::Soup;
823    ///
824    /// let soup = Soup::parse("<div>text<span>A</span>more<span>B</span></div>");
825    /// let div = soup.find("div").unwrap().unwrap();
826    /// let doc = soup.document();
827    ///
828    /// // Only elements, skipping text nodes
829    /// let count = doc.children(div.node_id()).elements().count();
830    /// assert_eq!(count, 2); // span, span
831    /// ```
832    #[must_use]
833    pub fn elements(self) -> ElementChildrenIter<'a, S> {
834        ElementChildrenIter { inner: self }
835    }
836}
837
838/// Iterator over element children only.
839///
840/// Created by [`ChildrenIter::elements`].
841#[derive(Debug)]
842pub struct ElementChildrenIter<'a, S: DocumentState = Queryable> {
843    inner: ChildrenIter<'a, S>,
844}
845
846impl<S: DocumentState> Iterator for ElementChildrenIter<'_, S> {
847    type Item = NodeId;
848
849    #[inline]
850    fn next(&mut self) -> Option<Self::Item> {
851        loop {
852            let id = self.inner.next()?;
853            if self.inner.doc.get(id).is_some_and(|n| n.kind.is_element()) {
854                return Some(id);
855            }
856        }
857    }
858}
859
860impl<'a, S: DocumentState> DescendantsIter<'a, S> {
861    /// Returns an iterator over only element descendants (skipping text/comment nodes).
862    ///
863    /// # Examples
864    ///
865    /// ```rust
866    /// use scrape_core::Soup;
867    ///
868    /// let soup = Soup::parse("<div>text<p>para</p></div>");
869    /// let div = soup.find("div").unwrap().unwrap();
870    /// let doc = soup.document();
871    ///
872    /// let count = doc.descendants(div.node_id()).elements().count();
873    /// assert_eq!(count, 1); // p only
874    /// ```
875    #[must_use]
876    pub fn elements(self) -> ElementDescendantsIter<'a, S> {
877        ElementDescendantsIter { inner: self }
878    }
879}
880
881/// Iterator over element descendants only.
882///
883/// Created by [`DescendantsIter::elements`].
884#[derive(Debug)]
885pub struct ElementDescendantsIter<'a, S: DocumentState = Queryable> {
886    inner: DescendantsIter<'a, S>,
887}
888
889impl<S: DocumentState> Iterator for ElementDescendantsIter<'_, S> {
890    type Item = NodeId;
891
892    #[inline]
893    fn next(&mut self) -> Option<Self::Item> {
894        loop {
895            let id = self.inner.next()?;
896            if self.inner.doc.get(id).is_some_and(|n| n.kind.is_element()) {
897                return Some(id);
898            }
899        }
900    }
901}
902
903impl<'a, S: DocumentState> AncestorsIter<'a, S> {
904    /// Returns an iterator over only element ancestors (skipping non-element nodes).
905    ///
906    /// In practice, ancestors are almost always elements, but this method
907    /// provides consistent API with other iterators.
908    #[must_use]
909    pub fn elements(self) -> ElementAncestorsIter<'a, S> {
910        ElementAncestorsIter { inner: self }
911    }
912}
913
914/// Iterator over element ancestors only.
915///
916/// Created by [`AncestorsIter::elements`].
917#[derive(Debug)]
918pub struct ElementAncestorsIter<'a, S: DocumentState = Queryable> {
919    inner: AncestorsIter<'a, S>,
920}
921
922impl<S: DocumentState> Iterator for ElementAncestorsIter<'_, S> {
923    type Item = NodeId;
924
925    #[inline]
926    fn next(&mut self) -> Option<Self::Item> {
927        loop {
928            let id = self.inner.next()?;
929            if self.inner.doc.get(id).is_some_and(|n| n.kind.is_element()) {
930                return Some(id);
931            }
932        }
933    }
934}
935
936impl<'a, S: DocumentState> NextSiblingsIter<'a, S> {
937    /// Returns an iterator over only element siblings (skipping text/comment nodes).
938    #[must_use]
939    pub fn elements(self) -> ElementNextSiblingsIter<'a, S> {
940        ElementNextSiblingsIter { inner: self }
941    }
942}
943
944/// Iterator over next element siblings only.
945///
946/// Created by [`NextSiblingsIter::elements`].
947#[derive(Debug)]
948pub struct ElementNextSiblingsIter<'a, S: DocumentState = Queryable> {
949    inner: NextSiblingsIter<'a, S>,
950}
951
952impl<S: DocumentState> Iterator for ElementNextSiblingsIter<'_, S> {
953    type Item = NodeId;
954
955    #[inline]
956    fn next(&mut self) -> Option<Self::Item> {
957        loop {
958            let id = self.inner.next()?;
959            if self.inner.doc.get(id).is_some_and(|n| n.kind.is_element()) {
960                return Some(id);
961            }
962        }
963    }
964}
965
966impl<'a, S: DocumentState> PrevSiblingsIter<'a, S> {
967    /// Returns an iterator over only element siblings (skipping text/comment nodes).
968    #[must_use]
969    pub fn elements(self) -> ElementPrevSiblingsIter<'a, S> {
970        ElementPrevSiblingsIter { inner: self }
971    }
972}
973
974/// Iterator over previous element siblings only.
975///
976/// Created by [`PrevSiblingsIter::elements`].
977#[derive(Debug)]
978pub struct ElementPrevSiblingsIter<'a, S: DocumentState = Queryable> {
979    inner: PrevSiblingsIter<'a, S>,
980}
981
982impl<S: DocumentState> Iterator for ElementPrevSiblingsIter<'_, S> {
983    type Item = NodeId;
984
985    #[inline]
986    fn next(&mut self) -> Option<Self::Item> {
987        loop {
988            let id = self.inner.next()?;
989            if self.inner.doc.get(id).is_some_and(|n| n.kind.is_element()) {
990                return Some(id);
991            }
992        }
993    }
994}
995
996impl<'a, S: DocumentState> SiblingsIter<'a, S> {
997    /// Returns an iterator over only element siblings (skipping text/comment nodes).
998    #[must_use]
999    pub fn elements(self) -> ElementSiblingsIter<'a, S> {
1000        ElementSiblingsIter { inner: self }
1001    }
1002}
1003
1004/// Iterator over all element siblings (excluding self).
1005///
1006/// Created by [`SiblingsIter::elements`].
1007#[derive(Debug)]
1008pub struct ElementSiblingsIter<'a, S: DocumentState = Queryable> {
1009    inner: SiblingsIter<'a, S>,
1010}
1011
1012impl<S: DocumentState> Iterator for ElementSiblingsIter<'_, S> {
1013    type Item = NodeId;
1014
1015    #[inline]
1016    fn next(&mut self) -> Option<Self::Item> {
1017        loop {
1018            let id = self.inner.next()?;
1019            if self.inner.doc.get(id).is_some_and(|n| n.kind.is_element()) {
1020                return Some(id);
1021            }
1022        }
1023    }
1024}
1025
1026#[cfg(test)]
1027mod tests {
1028    use super::*;
1029
1030    fn create_test_doc() -> Document {
1031        let mut doc = Document::new();
1032
1033        let html = doc.create_element("html", HashMap::new());
1034        doc.set_root(html);
1035
1036        let head = doc.create_element("head", HashMap::new());
1037        let body = doc.create_element("body", HashMap::new());
1038        let div = doc.create_element("div", HashMap::new());
1039        let text = doc.create_text("Hello");
1040
1041        doc.append_child(html, head);
1042        doc.append_child(html, body);
1043        doc.append_child(body, div);
1044        doc.append_child(div, text);
1045
1046        doc
1047    }
1048
1049    #[test]
1050    fn document_create_element() {
1051        let mut doc = Document::new();
1052        let id = doc.create_element("div", HashMap::new());
1053        assert_eq!(doc.len(), 1);
1054
1055        let node = doc.get(id).unwrap();
1056        assert!(node.kind.is_element());
1057        assert_eq!(node.kind.tag_name(), Some("div"));
1058    }
1059
1060    #[test]
1061    fn document_create_text() {
1062        let mut doc = Document::new();
1063        let id = doc.create_text("Hello World");
1064
1065        let node = doc.get(id).unwrap();
1066        assert!(node.kind.is_text());
1067        assert_eq!(node.kind.as_text(), Some("Hello World"));
1068    }
1069
1070    #[test]
1071    fn document_root() {
1072        let mut doc = Document::new();
1073        assert!(doc.root().is_none());
1074
1075        let root_id = doc.create_element("html", HashMap::new());
1076        doc.set_root(root_id);
1077
1078        assert_eq!(doc.root(), Some(root_id));
1079    }
1080
1081    #[test]
1082    fn parent_navigation() {
1083        let doc = create_test_doc();
1084        let root = doc.root().unwrap();
1085
1086        let body = doc.children(root).nth(1).unwrap();
1087        assert_eq!(doc.parent(body), Some(root));
1088    }
1089
1090    #[test]
1091    fn children_iteration() {
1092        let doc = create_test_doc();
1093        let root = doc.root().unwrap();
1094
1095        assert_eq!(doc.children(root).count(), 2);
1096    }
1097
1098    #[test]
1099    fn sibling_navigation() {
1100        let doc = create_test_doc();
1101        let root = doc.root().unwrap();
1102
1103        let head = doc.first_child(root).unwrap();
1104        let body = doc.next_sibling(head).unwrap();
1105
1106        assert_eq!(doc.prev_sibling(body), Some(head));
1107        assert!(doc.next_sibling(body).is_none());
1108    }
1109
1110    #[test]
1111    fn descendants_iteration() {
1112        let doc = create_test_doc();
1113        let root = doc.root().unwrap();
1114
1115        assert_eq!(doc.descendants(root).count(), 4);
1116    }
1117
1118    #[test]
1119    fn ancestors_iteration() {
1120        let doc = create_test_doc();
1121        let root = doc.root().unwrap();
1122
1123        let body = doc.children(root).nth(1).unwrap();
1124        let div = doc.first_child(body).unwrap();
1125        let text = doc.first_child(div).unwrap();
1126
1127        assert_eq!(doc.ancestors(text).count(), 3);
1128    }
1129
1130    #[test]
1131    fn first_and_last_child() {
1132        let doc = create_test_doc();
1133        let root = doc.root().unwrap();
1134
1135        let first = doc.first_child(root).unwrap();
1136        let last = doc.last_child(root).unwrap();
1137
1138        assert_eq!(doc.get(first).unwrap().kind.tag_name(), Some("head"));
1139        assert_eq!(doc.get(last).unwrap().kind.tag_name(), Some("body"));
1140    }
1141
1142    #[test]
1143    fn children_empty_for_leaf_nodes() {
1144        let doc = create_test_doc();
1145        let root = doc.root().unwrap();
1146
1147        let body = doc.children(root).nth(1).unwrap();
1148        let div = doc.first_child(body).unwrap();
1149        let text = doc.first_child(div).unwrap();
1150
1151        assert!(doc.children(text).next().is_none());
1152    }
1153
1154    #[test]
1155    fn descendants_empty_for_leaf_nodes() {
1156        let doc = create_test_doc();
1157        let root = doc.root().unwrap();
1158
1159        let body = doc.children(root).nth(1).unwrap();
1160        let div = doc.first_child(body).unwrap();
1161        let text = doc.first_child(div).unwrap();
1162
1163        assert!(doc.descendants(text).next().is_none());
1164    }
1165
1166    #[test]
1167    fn ancestors_empty_for_root() {
1168        let doc = create_test_doc();
1169        let root = doc.root().unwrap();
1170
1171        assert!(doc.ancestors(root).next().is_none());
1172    }
1173
1174    #[test]
1175    fn document_parent_child_relationship() {
1176        let mut doc = Document::new();
1177        let parent_id = doc.create_element("div", HashMap::new());
1178        let child_id = doc.create_element("span", HashMap::new());
1179
1180        doc.append_child(parent_id, child_id);
1181
1182        let parent = doc.get(parent_id).unwrap();
1183        assert_eq!(parent.first_child, Some(child_id));
1184        assert_eq!(parent.last_child, Some(child_id));
1185
1186        let child = doc.get(child_id).unwrap();
1187        assert_eq!(child.parent, Some(parent_id));
1188    }
1189
1190    #[test]
1191    fn document_sibling_links() {
1192        let mut doc = Document::new();
1193        let parent_id = doc.create_element("div", HashMap::new());
1194        let child1_id = doc.create_element("span", HashMap::new());
1195        let child2_id = doc.create_element("span", HashMap::new());
1196        let child3_id = doc.create_element("span", HashMap::new());
1197
1198        doc.append_child(parent_id, child1_id);
1199        doc.append_child(parent_id, child2_id);
1200        doc.append_child(parent_id, child3_id);
1201
1202        let child1 = doc.get(child1_id).unwrap();
1203        assert_eq!(child1.prev_sibling, None);
1204        assert_eq!(child1.next_sibling, Some(child2_id));
1205
1206        let child2 = doc.get(child2_id).unwrap();
1207        assert_eq!(child2.prev_sibling, Some(child1_id));
1208        assert_eq!(child2.next_sibling, Some(child3_id));
1209
1210        let child3 = doc.get(child3_id).unwrap();
1211        assert_eq!(child3.prev_sibling, Some(child2_id));
1212        assert_eq!(child3.next_sibling, None);
1213    }
1214
1215    #[test]
1216    fn descendants_order_depth_first() {
1217        let mut doc = Document::new();
1218        let root = doc.create_element("root", HashMap::new());
1219        let a = doc.create_element("a", HashMap::new());
1220        let b = doc.create_element("b", HashMap::new());
1221        let a1 = doc.create_element("a1", HashMap::new());
1222        let a2 = doc.create_element("a2", HashMap::new());
1223
1224        doc.set_root(root);
1225        doc.append_child(root, a);
1226        doc.append_child(root, b);
1227        doc.append_child(a, a1);
1228        doc.append_child(a, a2);
1229
1230        let names: Vec<_> =
1231            doc.descendants(root).map(|id| doc.get(id).unwrap().kind.tag_name().unwrap()).collect();
1232
1233        // Depth-first pre-order: a -> a1 -> a2 -> b
1234        assert_eq!(names, vec!["a", "a1", "a2", "b"]);
1235    }
1236
1237    #[test]
1238    fn next_siblings_iteration() {
1239        let doc = create_test_doc();
1240        let root = doc.root().unwrap();
1241        let head = doc.first_child(root).unwrap();
1242
1243        assert_eq!(doc.next_siblings(head).count(), 1); // body
1244    }
1245
1246    #[test]
1247    fn next_siblings_empty_for_last() {
1248        let doc = create_test_doc();
1249        let root = doc.root().unwrap();
1250        let body = doc.last_child(root).unwrap();
1251
1252        assert!(doc.next_siblings(body).next().is_none());
1253    }
1254
1255    #[test]
1256    fn prev_siblings_iteration() {
1257        let doc = create_test_doc();
1258        let root = doc.root().unwrap();
1259        let body = doc.last_child(root).unwrap();
1260
1261        assert_eq!(doc.prev_siblings(body).count(), 1); // head
1262    }
1263
1264    #[test]
1265    fn prev_siblings_empty_for_first() {
1266        let doc = create_test_doc();
1267        let root = doc.root().unwrap();
1268        let head = doc.first_child(root).unwrap();
1269
1270        assert!(doc.prev_siblings(head).next().is_none());
1271    }
1272
1273    #[test]
1274    fn siblings_iteration() {
1275        let mut doc = Document::new();
1276        let parent = doc.create_element("ul", HashMap::new());
1277        let child1 = doc.create_element("li", HashMap::new());
1278        let child2 = doc.create_element("li", HashMap::new());
1279        let child3 = doc.create_element("li", HashMap::new());
1280
1281        doc.append_child(parent, child1);
1282        doc.append_child(parent, child2);
1283        doc.append_child(parent, child3);
1284
1285        let siblings: Vec<_> = doc.siblings(child2).collect();
1286        assert_eq!(siblings.len(), 2);
1287        assert_eq!(siblings[0], child1); // Document order
1288        assert_eq!(siblings[1], child3);
1289    }
1290
1291    #[test]
1292    fn siblings_empty_for_only_child() {
1293        let mut doc = Document::new();
1294        let parent = doc.create_element("div", HashMap::new());
1295        let child = doc.create_element("span", HashMap::new());
1296
1297        doc.append_child(parent, child);
1298
1299        assert!(doc.siblings(child).next().is_none());
1300    }
1301
1302    #[test]
1303    fn siblings_excludes_self() {
1304        let mut doc = Document::new();
1305        let parent = doc.create_element("ul", HashMap::new());
1306        let child1 = doc.create_element("li", HashMap::new());
1307        let child2 = doc.create_element("li", HashMap::new());
1308
1309        doc.append_child(parent, child1);
1310        doc.append_child(parent, child2);
1311
1312        let siblings1: Vec<_> = doc.siblings(child1).collect();
1313        assert_eq!(siblings1.len(), 1);
1314        assert_eq!(siblings1[0], child2);
1315
1316        let siblings2: Vec<_> = doc.siblings(child2).collect();
1317        assert_eq!(siblings2.len(), 1);
1318        assert_eq!(siblings2[0], child1);
1319    }
1320
1321    #[test]
1322    fn test_children_elements() {
1323        let mut doc = Document::new();
1324        let parent = doc.create_element("div", HashMap::new());
1325        let text = doc.create_text("text");
1326        let child1 = doc.create_element("span", HashMap::new());
1327        let child2 = doc.create_element("p", HashMap::new());
1328
1329        doc.append_child(parent, text);
1330        doc.append_child(parent, child1);
1331        doc.append_child(parent, child2);
1332
1333        let mut iter = doc.children(parent).elements();
1334        assert_eq!(iter.next(), Some(child1));
1335        assert_eq!(iter.next(), Some(child2));
1336        assert_eq!(iter.next(), None);
1337    }
1338
1339    #[test]
1340    fn test_descendants_elements() {
1341        let mut doc = Document::new();
1342        let root = doc.create_element("div", HashMap::new());
1343        let text = doc.create_text("text");
1344        let child = doc.create_element("span", HashMap::new());
1345        let grandchild = doc.create_element("b", HashMap::new());
1346
1347        doc.append_child(root, text);
1348        doc.append_child(root, child);
1349        doc.append_child(child, grandchild);
1350
1351        assert_eq!(doc.descendants(root).elements().count(), 2); // span, b
1352    }
1353
1354    #[test]
1355    fn test_ancestors_elements() {
1356        let mut doc = Document::new();
1357        let root = doc.create_element("html", HashMap::new());
1358        let body = doc.create_element("body", HashMap::new());
1359        let div = doc.create_element("div", HashMap::new());
1360
1361        doc.set_root(root);
1362        doc.append_child(root, body);
1363        doc.append_child(body, div);
1364
1365        assert_eq!(doc.ancestors(div).elements().count(), 2);
1366    }
1367
1368    #[test]
1369    fn test_next_siblings_elements() {
1370        let mut doc = Document::new();
1371        let parent = doc.create_element("ul", HashMap::new());
1372        let li1 = doc.create_element("li", HashMap::new());
1373        let text = doc.create_text(" ");
1374        let li2 = doc.create_element("li", HashMap::new());
1375
1376        doc.append_child(parent, li1);
1377        doc.append_child(parent, text);
1378        doc.append_child(parent, li2);
1379
1380        let siblings: Vec<_> = doc.next_siblings(li1).elements().collect();
1381        assert_eq!(siblings.len(), 1);
1382        assert_eq!(siblings[0], li2);
1383    }
1384
1385    #[test]
1386    fn test_prev_siblings_elements() {
1387        let mut doc = Document::new();
1388        let parent = doc.create_element("ul", HashMap::new());
1389        let li1 = doc.create_element("li", HashMap::new());
1390        let text = doc.create_text(" ");
1391        let li2 = doc.create_element("li", HashMap::new());
1392
1393        doc.append_child(parent, li1);
1394        doc.append_child(parent, text);
1395        doc.append_child(parent, li2);
1396
1397        let siblings: Vec<_> = doc.prev_siblings(li2).elements().collect();
1398        assert_eq!(siblings.len(), 1);
1399        assert_eq!(siblings[0], li1);
1400    }
1401
1402    #[test]
1403    fn test_siblings_elements() {
1404        let mut doc = Document::new();
1405        let parent = doc.create_element("ul", HashMap::new());
1406        let li1 = doc.create_element("li", HashMap::new());
1407        let text = doc.create_text(" ");
1408        let li2 = doc.create_element("li", HashMap::new());
1409        let li3 = doc.create_element("li", HashMap::new());
1410
1411        doc.append_child(parent, li1);
1412        doc.append_child(parent, text);
1413        doc.append_child(parent, li2);
1414        doc.append_child(parent, li3);
1415
1416        assert_eq!(doc.siblings(li2).elements().count(), 2); // li1, li3
1417    }
1418}