scrape_core/dom/
document.rs

1//! Document container and tree operations.
2
3use std::collections::HashMap;
4
5use super::{
6    arena::Arena,
7    node::{Node, NodeId},
8};
9
10/// An HTML document containing a tree of nodes.
11///
12/// The document owns all nodes via an arena allocator, ensuring
13/// cache-friendly contiguous storage. Navigation is performed
14/// using [`NodeId`] handles.
15///
16/// # Architecture
17///
18/// Nodes are stored in a single contiguous `Arena<Node>`. Parent-child
19/// relationships use `first_child`/`last_child` links (O(1) append),
20/// and siblings are doubly linked via `prev_sibling`/`next_sibling`.
21///
22/// # Navigation
23///
24/// The document provides both direct navigation methods and lazy iterators:
25///
26/// - [`parent`](Document::parent), [`first_child`](Document::first_child),
27///   [`last_child`](Document::last_child) - direct links
28/// - [`children`](Document::children) - iterate over direct children
29/// - [`ancestors`](Document::ancestors) - iterate from parent to root
30/// - [`descendants`](Document::descendants) - depth-first subtree traversal
31#[derive(Debug)]
32pub struct Document {
33    arena: Arena<Node>,
34    root: Option<NodeId>,
35}
36
37impl Default for Document {
38    fn default() -> Self {
39        Self::new()
40    }
41}
42
43impl Document {
44    /// Creates a new empty document with default capacity.
45    ///
46    /// The default capacity is 256 nodes, which is sufficient for typical HTML pages
47    /// and reduces reallocations during parsing.
48    #[must_use]
49    pub fn new() -> Self {
50        Self::with_capacity(256)
51    }
52
53    /// Creates a new empty document with the specified capacity.
54    ///
55    /// Use this when you know the approximate number of nodes to avoid reallocations.
56    #[must_use]
57    pub fn with_capacity(capacity: usize) -> Self {
58        Self { arena: Arena::with_capacity(capacity), root: None }
59    }
60
61    /// Returns the root node ID, if any.
62    #[must_use]
63    pub fn root(&self) -> Option<NodeId> {
64        self.root
65    }
66
67    /// Sets the root node ID.
68    pub fn set_root(&mut self, id: NodeId) {
69        self.root = Some(id);
70    }
71
72    /// Returns a reference to the node with the given ID.
73    #[must_use]
74    pub fn get(&self, id: NodeId) -> Option<&Node> {
75        self.arena.get(id.index())
76    }
77
78    /// Returns a mutable reference to the node with the given ID.
79    #[must_use]
80    pub fn get_mut(&mut self, id: NodeId) -> Option<&mut Node> {
81        self.arena.get_mut(id.index())
82    }
83
84    /// Creates a new element node and returns its ID.
85    pub fn create_element(
86        &mut self,
87        name: impl Into<String>,
88        attributes: HashMap<String, String>,
89    ) -> NodeId {
90        NodeId::new(self.arena.alloc(Node::element(name, attributes)))
91    }
92
93    /// Creates a new text node and returns its ID.
94    pub fn create_text(&mut self, content: impl Into<String>) -> NodeId {
95        NodeId::new(self.arena.alloc(Node::text(content)))
96    }
97
98    /// Creates a new comment node and returns its ID.
99    pub fn create_comment(&mut self, content: impl Into<String>) -> NodeId {
100        NodeId::new(self.arena.alloc(Node::comment(content)))
101    }
102
103    /// Appends a child node to a parent.
104    ///
105    /// Updates parent, `first_child`, `last_child`, and sibling links.
106    ///
107    /// # Panics
108    ///
109    /// Panics in debug builds if `parent_id` or `child_id` are invalid.
110    pub fn append_child(&mut self, parent_id: NodeId, child_id: NodeId) {
111        debug_assert!(parent_id.index() < self.arena.len(), "Invalid parent_id");
112        debug_assert!(child_id.index() < self.arena.len(), "Invalid child_id");
113
114        // Get current last child of parent
115        let prev_last = self.arena.get(parent_id.index()).and_then(|p| p.last_child);
116
117        // Update child's parent and prev_sibling
118        if let Some(child) = self.arena.get_mut(child_id.index()) {
119            child.parent = Some(parent_id);
120            child.prev_sibling = prev_last;
121            child.next_sibling = None;
122        }
123
124        // Update previous last child's next_sibling
125        if let Some(prev_id) = prev_last
126            && let Some(prev) = self.arena.get_mut(prev_id.index())
127        {
128            prev.next_sibling = Some(child_id);
129        }
130
131        // Update parent's first_child (if first) and last_child
132        if let Some(parent) = self.arena.get_mut(parent_id.index()) {
133            if parent.first_child.is_none() {
134                parent.first_child = Some(child_id);
135            }
136            parent.last_child = Some(child_id);
137        }
138    }
139
140    /// Returns the number of nodes in the document.
141    #[must_use]
142    pub fn len(&self) -> usize {
143        self.arena.len()
144    }
145
146    /// Returns `true` if the document has no nodes.
147    #[must_use]
148    pub fn is_empty(&self) -> bool {
149        self.arena.is_empty()
150    }
151
152    /// Returns an iterator over all nodes.
153    pub fn nodes(&self) -> impl Iterator<Item = (NodeId, &Node)> {
154        self.arena.iter().map(|(i, node)| (NodeId::new(i), node))
155    }
156
157    // ==================== Navigation APIs ====================
158
159    /// Returns the parent of a node.
160    #[must_use]
161    pub fn parent(&self, id: NodeId) -> Option<NodeId> {
162        self.arena.get(id.index()).and_then(|n| n.parent)
163    }
164
165    /// Returns the first child of a node.
166    #[must_use]
167    pub fn first_child(&self, id: NodeId) -> Option<NodeId> {
168        self.arena.get(id.index()).and_then(|n| n.first_child)
169    }
170
171    /// Returns the last child of a node.
172    #[must_use]
173    pub fn last_child(&self, id: NodeId) -> Option<NodeId> {
174        self.arena.get(id.index()).and_then(|n| n.last_child)
175    }
176
177    /// Returns the next sibling of a node.
178    #[must_use]
179    pub fn next_sibling(&self, id: NodeId) -> Option<NodeId> {
180        self.arena.get(id.index()).and_then(|n| n.next_sibling)
181    }
182
183    /// Returns the previous sibling of a node.
184    #[must_use]
185    pub fn prev_sibling(&self, id: NodeId) -> Option<NodeId> {
186        self.arena.get(id.index()).and_then(|n| n.prev_sibling)
187    }
188
189    /// Returns an iterator over children of a node.
190    ///
191    /// The iterator yields children in order from first to last.
192    ///
193    /// # Examples
194    ///
195    /// ```
196    /// use std::collections::HashMap;
197    ///
198    /// use scrape_core::dom::Document;
199    ///
200    /// let mut doc = Document::new();
201    /// let parent = doc.create_element("div", HashMap::new());
202    /// let child1 = doc.create_element("span", HashMap::new());
203    /// let child2 = doc.create_element("span", HashMap::new());
204    ///
205    /// doc.append_child(parent, child1);
206    /// doc.append_child(parent, child2);
207    ///
208    /// let children: Vec<_> = doc.children(parent).collect();
209    /// assert_eq!(children.len(), 2);
210    /// ```
211    #[must_use]
212    pub fn children(&self, id: NodeId) -> ChildrenIter<'_> {
213        ChildrenIter { doc: self, current: self.first_child(id) }
214    }
215
216    /// Returns an iterator over ancestors of a node.
217    ///
218    /// The iterator yields ancestors from parent to root (does not include the node itself).
219    ///
220    /// # Examples
221    ///
222    /// ```
223    /// use std::collections::HashMap;
224    ///
225    /// use scrape_core::dom::Document;
226    ///
227    /// let mut doc = Document::new();
228    /// let grandparent = doc.create_element("html", HashMap::new());
229    /// let parent = doc.create_element("body", HashMap::new());
230    /// let child = doc.create_element("div", HashMap::new());
231    ///
232    /// doc.append_child(grandparent, parent);
233    /// doc.append_child(parent, child);
234    ///
235    /// let ancestors: Vec<_> = doc.ancestors(child).collect();
236    /// assert_eq!(ancestors.len(), 2); // parent, grandparent
237    /// ```
238    #[must_use]
239    pub fn ancestors(&self, id: NodeId) -> AncestorsIter<'_> {
240        AncestorsIter { doc: self, current: self.parent(id) }
241    }
242
243    /// Returns an iterator over descendants in depth-first pre-order.
244    ///
245    /// Does not include the starting node itself.
246    ///
247    /// # Examples
248    ///
249    /// ```
250    /// use std::collections::HashMap;
251    ///
252    /// use scrape_core::dom::Document;
253    ///
254    /// let mut doc = Document::new();
255    /// let root = doc.create_element("html", HashMap::new());
256    /// let child1 = doc.create_element("head", HashMap::new());
257    /// let child2 = doc.create_element("body", HashMap::new());
258    /// let grandchild = doc.create_element("div", HashMap::new());
259    ///
260    /// doc.append_child(root, child1);
261    /// doc.append_child(root, child2);
262    /// doc.append_child(child2, grandchild);
263    ///
264    /// let descendants: Vec<_> = doc.descendants(root).collect();
265    /// assert_eq!(descendants.len(), 3); // head, body, div
266    /// ```
267    #[must_use]
268    pub fn descendants(&self, id: NodeId) -> DescendantsIter<'_> {
269        DescendantsIter { doc: self, root: id, stack: vec![id], started: false }
270    }
271}
272
273/// Iterator over direct children of a node.
274///
275/// Created by [`Document::children`].
276#[derive(Debug)]
277pub struct ChildrenIter<'a> {
278    doc: &'a Document,
279    current: Option<NodeId>,
280}
281
282impl Iterator for ChildrenIter<'_> {
283    type Item = NodeId;
284
285    fn next(&mut self) -> Option<Self::Item> {
286        let current = self.current?;
287        self.current = self.doc.next_sibling(current);
288        Some(current)
289    }
290}
291
292/// Iterator over ancestors of a node (parent, grandparent, ...).
293///
294/// Created by [`Document::ancestors`].
295#[derive(Debug)]
296pub struct AncestorsIter<'a> {
297    doc: &'a Document,
298    current: Option<NodeId>,
299}
300
301impl Iterator for AncestorsIter<'_> {
302    type Item = NodeId;
303
304    fn next(&mut self) -> Option<Self::Item> {
305        let current = self.current?;
306        self.current = self.doc.parent(current);
307        Some(current)
308    }
309}
310
311/// Iterator over descendants in depth-first pre-order.
312///
313/// Created by [`Document::descendants`].
314#[derive(Debug)]
315pub struct DescendantsIter<'a> {
316    doc: &'a Document,
317    root: NodeId,
318    stack: Vec<NodeId>,
319    started: bool,
320}
321
322impl Iterator for DescendantsIter<'_> {
323    type Item = NodeId;
324
325    fn next(&mut self) -> Option<Self::Item> {
326        if !self.started {
327            self.started = true;
328            if let Some(first) = self.doc.first_child(self.root) {
329                self.stack.clear();
330                self.stack.push(first);
331            } else {
332                return None;
333            }
334        }
335
336        let current = self.stack.pop()?;
337
338        // Push next sibling first (so it's processed after children)
339        if let Some(next) = self.doc.next_sibling(current) {
340            self.stack.push(next);
341        }
342
343        // Push first child (will be processed next - depth-first)
344        if let Some(child) = self.doc.first_child(current) {
345            self.stack.push(child);
346        }
347
348        Some(current)
349    }
350}
351
352#[cfg(test)]
353mod tests {
354    use super::*;
355
356    fn create_test_doc() -> Document {
357        let mut doc = Document::new();
358
359        let html = doc.create_element("html", HashMap::new());
360        doc.set_root(html);
361
362        let head = doc.create_element("head", HashMap::new());
363        let body = doc.create_element("body", HashMap::new());
364        let div = doc.create_element("div", HashMap::new());
365        let text = doc.create_text("Hello");
366
367        doc.append_child(html, head);
368        doc.append_child(html, body);
369        doc.append_child(body, div);
370        doc.append_child(div, text);
371
372        doc
373    }
374
375    #[test]
376    fn document_create_element() {
377        let mut doc = Document::new();
378        let id = doc.create_element("div", HashMap::new());
379        assert_eq!(doc.len(), 1);
380
381        let node = doc.get(id).unwrap();
382        assert!(node.kind.is_element());
383        assert_eq!(node.kind.tag_name(), Some("div"));
384    }
385
386    #[test]
387    fn document_create_text() {
388        let mut doc = Document::new();
389        let id = doc.create_text("Hello World");
390
391        let node = doc.get(id).unwrap();
392        assert!(node.kind.is_text());
393        assert_eq!(node.kind.as_text(), Some("Hello World"));
394    }
395
396    #[test]
397    fn document_root() {
398        let mut doc = Document::new();
399        assert!(doc.root().is_none());
400
401        let root_id = doc.create_element("html", HashMap::new());
402        doc.set_root(root_id);
403
404        assert_eq!(doc.root(), Some(root_id));
405    }
406
407    #[test]
408    fn parent_navigation() {
409        let doc = create_test_doc();
410        let root = doc.root().unwrap();
411
412        let body = doc.children(root).nth(1).unwrap();
413        assert_eq!(doc.parent(body), Some(root));
414    }
415
416    #[test]
417    fn children_iteration() {
418        let doc = create_test_doc();
419        let root = doc.root().unwrap();
420
421        assert_eq!(doc.children(root).count(), 2);
422    }
423
424    #[test]
425    fn sibling_navigation() {
426        let doc = create_test_doc();
427        let root = doc.root().unwrap();
428
429        let head = doc.first_child(root).unwrap();
430        let body = doc.next_sibling(head).unwrap();
431
432        assert_eq!(doc.prev_sibling(body), Some(head));
433        assert!(doc.next_sibling(body).is_none());
434    }
435
436    #[test]
437    fn descendants_iteration() {
438        let doc = create_test_doc();
439        let root = doc.root().unwrap();
440
441        assert_eq!(doc.descendants(root).count(), 4);
442    }
443
444    #[test]
445    fn ancestors_iteration() {
446        let doc = create_test_doc();
447        let root = doc.root().unwrap();
448
449        let body = doc.children(root).nth(1).unwrap();
450        let div = doc.first_child(body).unwrap();
451        let text = doc.first_child(div).unwrap();
452
453        assert_eq!(doc.ancestors(text).count(), 3);
454    }
455
456    #[test]
457    fn first_and_last_child() {
458        let doc = create_test_doc();
459        let root = doc.root().unwrap();
460
461        let first = doc.first_child(root).unwrap();
462        let last = doc.last_child(root).unwrap();
463
464        assert_eq!(doc.get(first).unwrap().kind.tag_name(), Some("head"));
465        assert_eq!(doc.get(last).unwrap().kind.tag_name(), Some("body"));
466    }
467
468    #[test]
469    fn children_empty_for_leaf_nodes() {
470        let doc = create_test_doc();
471        let root = doc.root().unwrap();
472
473        let body = doc.children(root).nth(1).unwrap();
474        let div = doc.first_child(body).unwrap();
475        let text = doc.first_child(div).unwrap();
476
477        assert!(doc.children(text).next().is_none());
478    }
479
480    #[test]
481    fn descendants_empty_for_leaf_nodes() {
482        let doc = create_test_doc();
483        let root = doc.root().unwrap();
484
485        let body = doc.children(root).nth(1).unwrap();
486        let div = doc.first_child(body).unwrap();
487        let text = doc.first_child(div).unwrap();
488
489        assert!(doc.descendants(text).next().is_none());
490    }
491
492    #[test]
493    fn ancestors_empty_for_root() {
494        let doc = create_test_doc();
495        let root = doc.root().unwrap();
496
497        assert!(doc.ancestors(root).next().is_none());
498    }
499
500    #[test]
501    fn document_parent_child_relationship() {
502        let mut doc = Document::new();
503        let parent_id = doc.create_element("div", HashMap::new());
504        let child_id = doc.create_element("span", HashMap::new());
505
506        doc.append_child(parent_id, child_id);
507
508        let parent = doc.get(parent_id).unwrap();
509        assert_eq!(parent.first_child, Some(child_id));
510        assert_eq!(parent.last_child, Some(child_id));
511
512        let child = doc.get(child_id).unwrap();
513        assert_eq!(child.parent, Some(parent_id));
514    }
515
516    #[test]
517    fn document_sibling_links() {
518        let mut doc = Document::new();
519        let parent_id = doc.create_element("div", HashMap::new());
520        let child1_id = doc.create_element("span", HashMap::new());
521        let child2_id = doc.create_element("span", HashMap::new());
522        let child3_id = doc.create_element("span", HashMap::new());
523
524        doc.append_child(parent_id, child1_id);
525        doc.append_child(parent_id, child2_id);
526        doc.append_child(parent_id, child3_id);
527
528        let child1 = doc.get(child1_id).unwrap();
529        assert_eq!(child1.prev_sibling, None);
530        assert_eq!(child1.next_sibling, Some(child2_id));
531
532        let child2 = doc.get(child2_id).unwrap();
533        assert_eq!(child2.prev_sibling, Some(child1_id));
534        assert_eq!(child2.next_sibling, Some(child3_id));
535
536        let child3 = doc.get(child3_id).unwrap();
537        assert_eq!(child3.prev_sibling, Some(child2_id));
538        assert_eq!(child3.next_sibling, None);
539    }
540
541    #[test]
542    fn descendants_order_depth_first() {
543        let mut doc = Document::new();
544        let root = doc.create_element("root", HashMap::new());
545        let a = doc.create_element("a", HashMap::new());
546        let b = doc.create_element("b", HashMap::new());
547        let a1 = doc.create_element("a1", HashMap::new());
548        let a2 = doc.create_element("a2", HashMap::new());
549
550        doc.set_root(root);
551        doc.append_child(root, a);
552        doc.append_child(root, b);
553        doc.append_child(a, a1);
554        doc.append_child(a, a2);
555
556        let names: Vec<_> =
557            doc.descendants(root).map(|id| doc.get(id).unwrap().kind.tag_name().unwrap()).collect();
558
559        // Depth-first pre-order: a -> a1 -> a2 -> b
560        assert_eq!(names, vec!["a", "a1", "a2", "b"]);
561    }
562}