Skip to main content

dom_query/
document.rs

1use std::borrow::Cow;
2use std::cell::{Cell, Ref, RefCell};
3
4#[allow(unused_imports)]
5use html5ever::namespace_url;
6use html5ever::parse_document;
7use html5ever::tree_builder;
8use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
9use html5ever::ParseOpts;
10use html5ever::{local_name, ns};
11use html5ever::{Attribute, QualName};
12
13use tendril::{StrTendril, TendrilSink};
14
15use crate::dom_tree::Tree;
16use crate::entities::wrap_tendril;
17use crate::matcher::{DescendantMatches, Matcher};
18use crate::node::{Element, NodeData, NodeId, NodeRef, TreeNode};
19use crate::selection::Selection;
20/// Document represents an HTML document to be manipulated.
21#[derive(Clone)]
22pub struct Document {
23    /// The document's dom tree.
24    pub tree: Tree,
25
26    /// Errors that occurred during parsing.
27    pub errors: RefCell<Vec<Cow<'static, str>>>,
28
29    /// The document's quirks mode.
30    pub quirks_mode: Cell<QuirksMode>,
31}
32
33impl Default for Document {
34    fn default() -> Self {
35        Self {
36            tree: Tree::new(NodeData::Document),
37            errors: RefCell::new(vec![]),
38            quirks_mode: Cell::new(tree_builder::NoQuirks),
39        }
40    }
41}
42
43impl<T: Into<StrTendril>> From<T> for Document {
44    fn from(html: T) -> Self {
45        let opts = ParseOpts {
46            tokenizer: Default::default(),
47            tree_builder: tree_builder::TreeBuilderOpts {
48                scripting_enabled: false,
49                ..Default::default()
50            },
51        };
52        parse_document(Document::default(), opts).one(html)
53    }
54}
55
56// fragment
57impl Document {
58    /// Creates a new HTML document fragment.
59    pub fn fragment<T: Into<StrTendril>>(html: T) -> Self {
60        // Note: The `body` context element is somehow ignored during parsing,
61        // so the `html` element becomes the first child of the root node,
62        // rather than being nested inside a `body` element as expected.
63        html5ever::parse_fragment(
64            Document::fragment_sink(),
65            ParseOpts {
66                tokenizer: Default::default(),
67                tree_builder: tree_builder::TreeBuilderOpts {
68                    scripting_enabled: false,
69                    drop_doctype: true,
70                    ..Default::default()
71                },
72            },
73            QualName::new(None, ns!(html), local_name!("body")),
74            Vec::new(),
75            false,
76        )
77        .one(html)
78    }
79    /// Create a new sink for a html document fragment
80    pub fn fragment_sink() -> Self {
81        Self {
82            tree: Tree::new(NodeData::Fragment),
83            errors: RefCell::new(vec![]),
84            quirks_mode: Cell::new(tree_builder::NoQuirks),
85        }
86    }
87}
88
89// property methods
90impl Document {
91    /// Return the underlying root document node.
92    #[inline]
93    pub fn root(&self) -> NodeRef<'_> {
94        self.tree.root()
95    }
96
97    /// Returns the root element node (`<html>`) of the document.
98    pub fn html_root(&self) -> NodeRef<'_> {
99        self.tree.html_root()
100    }
101
102    /// Gets the HTML contents of the document. It includes
103    /// the text and comment nodes.
104    pub fn html(&self) -> StrTendril {
105        self.root().html()
106    }
107
108    /// Gets the HTML contents of the document.
109    /// It includes only children nodes.
110    pub fn inner_html(&self) -> StrTendril {
111        self.root().inner_html()
112    }
113
114    /// Gets the HTML contents of the document.
115    /// It includes its children nodes.
116    pub fn try_html(&self) -> Option<StrTendril> {
117        self.root().try_html()
118    }
119
120    /// Gets the HTML contents of the document.
121    /// It includes only children nodes.
122    pub fn try_inner_html(&self) -> Option<StrTendril> {
123        self.root().try_inner_html()
124    }
125
126    /// Gets the text content of the document.
127    pub fn text(&self) -> StrTendril {
128        self.root().text()
129    }
130
131    /// Returns the formatted text of the document and its descendants. This is the same as
132    /// the `text()` method, but with a few differences:
133    ///
134    /// - Whitespace is normalized so that there is only one space between words.
135    /// - All whitespace is removed from the beginning and end of the text.
136    /// - After block elements, a double newline is added.
137    /// - For elements like `br`, 'hr', 'li', 'tr' a single newline is added.
138    pub fn formatted_text(&self) -> StrTendril {
139        self.root().formatted_text()
140    }
141
142    /// Finds the base URI of the tree by looking for `<base>` tags in document's head.
143    ///
144    /// The base URI is the value of the `href` attribute of the first
145    /// `<base>` tag in the document's head. If no such tag is found,
146    /// the method returns `None`.
147    ///
148    pub fn base_uri(&self) -> Option<StrTendril> {
149        self.tree.base_uri()
150    }
151
152    /// Returns the document's `<body>` element, or `None` if absent.
153    /// For fragments ([crate::NodeData::Fragment]), this typically returns `None`.
154    pub fn body(&self) -> Option<NodeRef<'_>> {
155        self.tree.body()
156    }
157
158    /// Returns the document's `<head>` element, or `None` if absent.
159    /// For fragments ([crate::NodeData::Fragment]), this typically returns `None`.
160    pub fn head(&self) -> Option<NodeRef<'_>> {
161        self.tree.head()
162    }
163
164    /// Merges adjacent text nodes and removes empty text nodes.
165    ///
166    /// Normalization is necessary to ensure that adjacent text nodes are merged into one text node.
167    ///
168    /// # Example
169    ///
170    /// ```
171    /// use dom_query::Document;
172    ///
173    /// let doc = Document::from("<div>Hello</div>");
174    /// let sel = doc.select("div");
175    /// let div = sel.nodes().first().unwrap();
176    /// let text1 = doc.tree.new_text(" ");
177    /// let text2 = doc.tree.new_text("World");
178    /// let text3 = doc.tree.new_text("");
179    /// div.append_child(&text1);
180    /// div.append_child(&text2);
181    /// div.append_child(&text3);
182    /// assert_eq!(div.children().len(), 4);
183    /// doc.normalize();
184    /// assert_eq!(div.children().len(), 1);
185    /// assert_eq!(&div.text(), "Hello World");
186    /// ```
187    pub fn normalize(&self) {
188        self.root().normalize();
189    }
190}
191
192// traversal methods
193impl Document {
194    /// Gets the descendants of the root document node in the current, filter by a selector.
195    /// It returns a new selection object containing these matched elements.
196    ///
197    /// # Panics
198    ///
199    /// Panics if failed to parse the given CSS selector.
200    pub fn select(&self, sel: &str) -> Selection<'_> {
201        let matcher = Matcher::new(sel).expect("Invalid CSS selector");
202        self.select_matcher(&matcher)
203    }
204
205    /// Alias for `select`, it gets the descendants of the root document node in the current, filter by a selector.
206    /// It returns a new selection object containing these matched elements.
207    ///
208    /// # Panics
209    ///
210    /// Panics if failed to parse the given CSS selector.
211    pub fn nip(&self, sel: &str) -> Selection<'_> {
212        self.select(sel)
213    }
214
215    /// Gets the descendants of the root document node in the current, filter by a selector.
216    /// It returns a new selection object containing these matched elements.
217    pub fn try_select(&self, sel: &str) -> Option<Selection<'_>> {
218        Matcher::new(sel).ok().and_then(|matcher| {
219            let selection = self.select_matcher(&matcher);
220            if !selection.is_empty() {
221                Some(selection)
222            } else {
223                None
224            }
225        })
226    }
227
228    /// Gets the descendants of the root document node in the current, filter by a matcher.
229    /// It returns a new selection object containing these matched elements.
230    pub fn select_matcher(&self, matcher: &Matcher) -> Selection<'_> {
231        let root = self.tree.root();
232        let nodes = DescendantMatches::new(root, matcher).collect();
233
234        Selection { nodes }
235    }
236
237    /// Gets the descendants of the root document node in the current, filter by a matcher.
238    /// It returns a new selection object containing elements of the single (first) match.    
239    pub fn select_single_matcher(&self, matcher: &Matcher) -> Selection<'_> {
240        let node = DescendantMatches::new(self.tree.root(), matcher).next();
241
242        match node {
243            Some(node) => node.into(),
244            None => Default::default(),
245        }
246    }
247
248    /// Gets the descendants of the root document node in the current, filter by a selector.
249    /// It returns a new selection object containing elements of the single (first) match.
250    ///
251    /// # Panics
252    ///
253    /// Panics if failed to parse the given CSS selector.
254    pub fn select_single(&self, sel: &str) -> Selection<'_> {
255        let matcher = Matcher::new(sel).expect("Invalid CSS selector");
256        self.select_single_matcher(&matcher)
257    }
258}
259
260impl TreeSink for Document {
261    type ElemName<'a> = Ref<'a, QualName>;
262    /// The overall result of parsing.
263    type Output = Self;
264    /// Handle is a reference to a DOM node. The tree builder requires that a `Handle` implements `Clone` to get
265    /// another reference to the same node.
266    type Handle = NodeId;
267
268    /// Consume this sink and return the overall result of parsing.
269    #[inline]
270    fn finish(self) -> Self {
271        self
272    }
273
274    /// Signal a parse error.
275    #[inline]
276    fn parse_error(&self, msg: Cow<'static, str>) {
277        let mut errors = self.errors.borrow_mut();
278        errors.push(msg);
279    }
280
281    /// Get a handle to the `Document` node.
282    #[inline]
283    fn get_document(&self) -> Self::Handle {
284        self.tree.root_id()
285    }
286
287    /// Get a handle to a template's template contents. The tree builder promises this will never be called with
288    /// something else than a template element.
289    #[inline]
290    fn get_template_contents(&self, target: &Self::Handle) -> Self::Handle {
291        self.tree
292            .query_node_or(target, None, |node| {
293                node.as_element().and_then(|elem| elem.template_contents)
294            })
295            .expect("target node is not a template element!")
296    }
297
298    /// Set the document's quirks mode.
299    #[inline]
300    fn set_quirks_mode(&self, mode: QuirksMode) {
301        self.quirks_mode.set(mode);
302    }
303
304    /// Do two handles refer to the same node?.
305    #[inline]
306    fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
307        *x == *y
308    }
309
310    /// What is the name of the element?
311    /// Should never be called on a non-element node; Feel free to `panic!`.
312    #[inline]
313    fn elem_name(&self, target: &Self::Handle) -> Self::ElemName<'_> {
314        self.tree
315            .get_name(target)
316            .expect("target node is not an element!")
317    }
318
319    /// Create an element.
320    /// When creating a template element (`name.ns.expanded() == expanded_name!(html"template")`), an
321    /// associated document fragment called the "template contents" should also be created. Later calls to
322    /// self.get_template_contents() with that given element return it. See `the template element in the whatwg spec`,
323    #[inline]
324    fn create_element(
325        &self,
326        name: QualName,
327        attrs: Vec<Attribute>,
328        flags: ElementFlags,
329    ) -> Self::Handle {
330        let mut nodes = self.tree.nodes.borrow_mut();
331        let new_elem_id = NodeId::new(nodes.len());
332        let template_contents = if flags.template {
333            Some(NodeId::new(nodes.len() + 1))
334        } else {
335            None
336        };
337
338        let data = NodeData::Element(Element::new(
339            name,
340            attrs,
341            template_contents,
342            flags.mathml_annotation_xml_integration_point,
343        ));
344
345        nodes.push(TreeNode::new(new_elem_id, data));
346
347        if let Some(fragment_id) = template_contents {
348            nodes.push(TreeNode::new(fragment_id, NodeData::Fragment));
349            // The template's content is considered outside of the main document,
350            // so its DocumentFragment remains parentless.
351        }
352
353        new_elem_id
354    }
355
356    /// Create a comment node.
357    #[inline]
358    fn create_comment(&self, text: StrTendril) -> Self::Handle {
359        self.tree.create_node(NodeData::Comment {
360            contents: wrap_tendril(text),
361        })
362    }
363
364    /// Create a Processing Instruction node.
365    #[inline]
366    fn create_pi(&self, target: StrTendril, data: StrTendril) -> Self::Handle {
367        self.tree.create_node(NodeData::ProcessingInstruction {
368            target: wrap_tendril(target),
369            contents: wrap_tendril(data),
370        })
371    }
372
373    /// Append a node as the last child of the given node. If this would produce adjacent sibling text nodes, it
374    /// should concatenate the text instead.
375    /// The child node will not already have a parent.
376    fn append(&self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
377        // Append to an existing Text node if we have one.
378
379        match child {
380            NodeOrText::AppendNode(node_id) => self.tree.append_child_of(parent, &node_id),
381            NodeOrText::AppendText(text) => {
382                let last_child = self.tree.last_child_of(parent);
383                let merged = last_child
384                    .map(|child| append_to_existing_text(&child, &text))
385                    .unwrap_or(false);
386
387                if merged {
388                    return;
389                }
390
391                self.tree.append_child_data_of(
392                    parent,
393                    NodeData::Text {
394                        contents: wrap_tendril(text),
395                    },
396                )
397            }
398        }
399    }
400
401    /// Append a node as the sibling immediately before the given node.
402    /// The tree builder promises that `sibling` is not a text node. However its old previous sibling, which would
403    /// become the new node's previous sibling, could be a text node. If the new node is also a text node, the two
404    /// should be merged, as in the behavior of `append`.
405    fn append_before_sibling(&self, sibling: &Self::Handle, child: NodeOrText<Self::Handle>) {
406        match child {
407            NodeOrText::AppendText(text) => {
408                let prev_sibling = self.tree.prev_sibling_of(sibling);
409                let merged = prev_sibling
410                    .map(|sibling| append_to_existing_text(&sibling, &text))
411                    .unwrap_or(false);
412
413                if merged {
414                    return;
415                }
416
417                let id = self.tree.create_node(NodeData::Text {
418                    contents: wrap_tendril(text),
419                });
420                self.tree.insert_before_of(sibling, &id);
421            }
422
423            // The tree builder promises we won't have a text node after
424            // the insertion point.
425
426            // Any other kind of node.
427            NodeOrText::AppendNode(id) => self.tree.insert_before_of(sibling, &id),
428        };
429    }
430
431    /// When the insertion point is decided by the existence of a parent node of the element, we consider both
432    /// possibilities and send the element which will be used if a parent node exists, along with the element to be
433    /// used if there isn't one.
434    fn append_based_on_parent_node(
435        &self,
436        element: &Self::Handle,
437        prev_element: &Self::Handle,
438        child: NodeOrText<Self::Handle>,
439    ) {
440        let has_parent = self
441            .tree
442            .nodes
443            .borrow()
444            .get(element.value)
445            .is_some_and(|node| node.parent.is_some());
446
447        if has_parent {
448            self.append_before_sibling(element, child);
449        } else {
450            self.append(prev_element, child);
451        }
452    }
453
454    /// Append a `DOCTYPE` element to the `Document` node.
455    #[inline]
456    fn append_doctype_to_document(
457        &self,
458        name: StrTendril,
459        public_id: StrTendril,
460        system_id: StrTendril,
461    ) {
462        let root = self.tree.root_id();
463        self.tree.append_child_data_of(
464            &root,
465            NodeData::Doctype {
466                name: wrap_tendril(name),
467                public_id: wrap_tendril(public_id),
468                system_id: wrap_tendril(system_id),
469            },
470        );
471    }
472
473    /// Add each attribute to the given element, if no attribute with that name already exists. The tree builder
474    /// promises this will never be called with something else than an element.
475    fn add_attrs_if_missing(&self, target: &Self::Handle, attrs: Vec<Attribute>) {
476        self.tree.update_node(target, |node| {
477            if let Some(el) = node.as_element_mut() {
478                el.add_attrs_if_missing(attrs);
479            }
480        });
481    }
482
483    /// Detach the given node from its parent.
484    #[inline]
485    fn remove_from_parent(&self, target: &Self::Handle) {
486        self.tree.remove_from_parent(target);
487    }
488
489    /// Remove all the children from node and append them to new_parent.
490    #[inline]
491    fn reparent_children(&self, node: &Self::Handle, new_parent: &Self::Handle) {
492        self.tree.reparent_children_of(node, Some(*new_parent));
493    }
494
495    fn is_mathml_annotation_xml_integration_point(&self, handle: &Self::Handle) -> bool {
496        self.tree.is_mathml_annotation_xml_integration_point(handle)
497    }
498}
499
500fn append_to_existing_text(prev: &NodeRef, text: &StrTendril) -> bool {
501    prev.tree
502        .update_node(&prev.id, |tree_node| match tree_node.data {
503            NodeData::Text { ref mut contents } => {
504                #[cfg(not(feature = "atomic"))]
505                contents.push_tendril(text);
506
507                #[cfg(feature = "atomic")]
508                contents.push_slice(text);
509                true
510            }
511            _ => false,
512        })
513        .unwrap_or(false)
514}
515
#[cfg(feature = "markdown")]
impl Document {
    /// Produces a *Markdown* representation of the [`Document`].
    ///
    /// Elements matching the `skip_tags` list are skipped along with their
    /// descendants:
    ///
    /// - If `skip_tags` is `None`, the default list is used: `["script", "style", "meta", "head"]`.
    /// - To process all elements without exclusions, pass `Some(&[])`.
    pub fn md(&self, skip_tags: Option<&[&str]>) -> StrTendril {
        let root = self.root();
        root.md(skip_tags)
    }
}