dom_query/document.rs
1use std::borrow::Cow;
2use std::cell::{Cell, Ref, RefCell};
3
4#[allow(unused_imports)]
5use html5ever::namespace_url;
6use html5ever::parse_document;
7use html5ever::tree_builder;
8use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
9use html5ever::ParseOpts;
10use html5ever::{local_name, ns};
11use html5ever::{Attribute, QualName};
12
13use tendril::{StrTendril, TendrilSink};
14
15use crate::dom_tree::Tree;
16use crate::entities::wrap_tendril;
17use crate::matcher::{DescendantMatches, Matcher};
18use crate::node::{Element, NodeData, NodeId, NodeRef, TreeNode};
19use crate::selection::Selection;
20/// Document represents an HTML document to be manipulated.
21#[derive(Clone)]
22pub struct Document {
23 /// The document's dom tree.
24 pub tree: Tree,
25
26 /// Errors that occurred during parsing.
27 pub errors: RefCell<Vec<Cow<'static, str>>>,
28
29 /// The document's quirks mode.
30 pub quirks_mode: Cell<QuirksMode>,
31}
32
33impl Default for Document {
34 fn default() -> Self {
35 Self {
36 tree: Tree::new(NodeData::Document),
37 errors: RefCell::new(vec![]),
38 quirks_mode: Cell::new(tree_builder::NoQuirks),
39 }
40 }
41}
42
43impl<T: Into<StrTendril>> From<T> for Document {
44 fn from(html: T) -> Self {
45 let opts = ParseOpts {
46 tokenizer: Default::default(),
47 tree_builder: tree_builder::TreeBuilderOpts {
48 scripting_enabled: false,
49 ..Default::default()
50 },
51 };
52 parse_document(Document::default(), opts).one(html)
53 }
54}
55
56// fragment
57impl Document {
58 /// Creates a new HTML document fragment.
59 pub fn fragment<T: Into<StrTendril>>(html: T) -> Self {
60 // Note: The `body` context element is somehow ignored during parsing,
61 // so the `html` element becomes the first child of the root node,
62 // rather than being nested inside a `body` element as expected.
63 html5ever::parse_fragment(
64 Document::fragment_sink(),
65 ParseOpts {
66 tokenizer: Default::default(),
67 tree_builder: tree_builder::TreeBuilderOpts {
68 scripting_enabled: false,
69 drop_doctype: true,
70 ..Default::default()
71 },
72 },
73 QualName::new(None, ns!(html), local_name!("body")),
74 Vec::new(),
75 false,
76 )
77 .one(html)
78 }
79 /// Create a new sink for a html document fragment
80 pub fn fragment_sink() -> Self {
81 Self {
82 tree: Tree::new(NodeData::Fragment),
83 errors: RefCell::new(vec![]),
84 quirks_mode: Cell::new(tree_builder::NoQuirks),
85 }
86 }
87}
88
89// property methods
90impl Document {
91 /// Return the underlying root document node.
92 #[inline]
93 pub fn root(&self) -> NodeRef<'_> {
94 self.tree.root()
95 }
96
97 /// Returns the root element node (`<html>`) of the document.
98 pub fn html_root(&self) -> NodeRef<'_> {
99 self.tree.html_root()
100 }
101
102 /// Gets the HTML contents of the document. It includes
103 /// the text and comment nodes.
104 pub fn html(&self) -> StrTendril {
105 self.root().html()
106 }
107
108 /// Gets the HTML contents of the document.
109 /// It includes only children nodes.
110 pub fn inner_html(&self) -> StrTendril {
111 self.root().inner_html()
112 }
113
114 /// Gets the HTML contents of the document.
115 /// It includes its children nodes.
116 pub fn try_html(&self) -> Option<StrTendril> {
117 self.root().try_html()
118 }
119
120 /// Gets the HTML contents of the document.
121 /// It includes only children nodes.
122 pub fn try_inner_html(&self) -> Option<StrTendril> {
123 self.root().try_inner_html()
124 }
125
126 /// Gets the text content of the document.
127 pub fn text(&self) -> StrTendril {
128 self.root().text()
129 }
130
131 /// Returns the formatted text of the document and its descendants. This is the same as
132 /// the `text()` method, but with a few differences:
133 ///
134 /// - Whitespace is normalized so that there is only one space between words.
135 /// - All whitespace is removed from the beginning and end of the text.
136 /// - After block elements, a double newline is added.
137 /// - For elements like `br`, 'hr', 'li', 'tr' a single newline is added.
138 pub fn formatted_text(&self) -> StrTendril {
139 self.root().formatted_text()
140 }
141
142 /// Finds the base URI of the tree by looking for `<base>` tags in document's head.
143 ///
144 /// The base URI is the value of the `href` attribute of the first
145 /// `<base>` tag in the document's head. If no such tag is found,
146 /// the method returns `None`.
147 ///
148 pub fn base_uri(&self) -> Option<StrTendril> {
149 self.tree.base_uri()
150 }
151
152 /// Returns the document's `<body>` element, or `None` if absent.
153 /// For fragments ([crate::NodeData::Fragment]), this typically returns `None`.
154 pub fn body(&self) -> Option<NodeRef<'_>> {
155 self.tree.body()
156 }
157
158 /// Returns the document's `<head>` element, or `None` if absent.
159 /// For fragments ([crate::NodeData::Fragment]), this typically returns `None`.
160 pub fn head(&self) -> Option<NodeRef<'_>> {
161 self.tree.head()
162 }
163
164 /// Merges adjacent text nodes and removes empty text nodes.
165 ///
166 /// Normalization is necessary to ensure that adjacent text nodes are merged into one text node.
167 ///
168 /// # Example
169 ///
170 /// ```
171 /// use dom_query::Document;
172 ///
173 /// let doc = Document::from("<div>Hello</div>");
174 /// let sel = doc.select("div");
175 /// let div = sel.nodes().first().unwrap();
176 /// let text1 = doc.tree.new_text(" ");
177 /// let text2 = doc.tree.new_text("World");
178 /// let text3 = doc.tree.new_text("");
179 /// div.append_child(&text1);
180 /// div.append_child(&text2);
181 /// div.append_child(&text3);
182 /// assert_eq!(div.children().len(), 4);
183 /// doc.normalize();
184 /// assert_eq!(div.children().len(), 1);
185 /// assert_eq!(&div.text(), "Hello World");
186 /// ```
187 pub fn normalize(&self) {
188 self.root().normalize();
189 }
190}
191
192// traversal methods
193impl Document {
194 /// Gets the descendants of the root document node in the current, filter by a selector.
195 /// It returns a new selection object containing these matched elements.
196 ///
197 /// # Panics
198 ///
199 /// Panics if failed to parse the given CSS selector.
200 pub fn select(&self, sel: &str) -> Selection<'_> {
201 let matcher = Matcher::new(sel).expect("Invalid CSS selector");
202 self.select_matcher(&matcher)
203 }
204
205 /// Alias for `select`, it gets the descendants of the root document node in the current, filter by a selector.
206 /// It returns a new selection object containing these matched elements.
207 ///
208 /// # Panics
209 ///
210 /// Panics if failed to parse the given CSS selector.
211 pub fn nip(&self, sel: &str) -> Selection<'_> {
212 self.select(sel)
213 }
214
215 /// Gets the descendants of the root document node in the current, filter by a selector.
216 /// It returns a new selection object containing these matched elements.
217 pub fn try_select(&self, sel: &str) -> Option<Selection<'_>> {
218 Matcher::new(sel).ok().and_then(|matcher| {
219 let selection = self.select_matcher(&matcher);
220 if !selection.is_empty() {
221 Some(selection)
222 } else {
223 None
224 }
225 })
226 }
227
228 /// Gets the descendants of the root document node in the current, filter by a matcher.
229 /// It returns a new selection object containing these matched elements.
230 pub fn select_matcher(&self, matcher: &Matcher) -> Selection<'_> {
231 let root = self.tree.root();
232 let nodes = DescendantMatches::new(root, matcher).collect();
233
234 Selection { nodes }
235 }
236
237 /// Gets the descendants of the root document node in the current, filter by a matcher.
238 /// It returns a new selection object containing elements of the single (first) match.
239 pub fn select_single_matcher(&self, matcher: &Matcher) -> Selection<'_> {
240 let node = DescendantMatches::new(self.tree.root(), matcher).next();
241
242 match node {
243 Some(node) => node.into(),
244 None => Default::default(),
245 }
246 }
247
248 /// Gets the descendants of the root document node in the current, filter by a selector.
249 /// It returns a new selection object containing elements of the single (first) match.
250 ///
251 /// # Panics
252 ///
253 /// Panics if failed to parse the given CSS selector.
254 pub fn select_single(&self, sel: &str) -> Selection<'_> {
255 let matcher = Matcher::new(sel).expect("Invalid CSS selector");
256 self.select_single_matcher(&matcher)
257 }
258}
259
260impl TreeSink for Document {
261 type ElemName<'a> = Ref<'a, QualName>;
262 /// The overall result of parsing.
263 type Output = Self;
264 /// Handle is a reference to a DOM node. The tree builder requires that a `Handle` implements `Clone` to get
265 /// another reference to the same node.
266 type Handle = NodeId;
267
268 /// Consume this sink and return the overall result of parsing.
269 #[inline]
270 fn finish(self) -> Self {
271 self
272 }
273
274 /// Signal a parse error.
275 #[inline]
276 fn parse_error(&self, msg: Cow<'static, str>) {
277 let mut errors = self.errors.borrow_mut();
278 errors.push(msg);
279 }
280
281 /// Get a handle to the `Document` node.
282 #[inline]
283 fn get_document(&self) -> Self::Handle {
284 self.tree.root_id()
285 }
286
287 /// Get a handle to a template's template contents. The tree builder promises this will never be called with
288 /// something else than a template element.
289 #[inline]
290 fn get_template_contents(&self, target: &Self::Handle) -> Self::Handle {
291 self.tree
292 .query_node_or(target, None, |node| {
293 node.as_element().and_then(|elem| elem.template_contents)
294 })
295 .expect("target node is not a template element!")
296 }
297
298 /// Set the document's quirks mode.
299 #[inline]
300 fn set_quirks_mode(&self, mode: QuirksMode) {
301 self.quirks_mode.set(mode);
302 }
303
304 /// Do two handles refer to the same node?.
305 #[inline]
306 fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
307 *x == *y
308 }
309
310 /// What is the name of the element?
311 /// Should never be called on a non-element node; Feel free to `panic!`.
312 #[inline]
313 fn elem_name(&self, target: &Self::Handle) -> Self::ElemName<'_> {
314 self.tree
315 .get_name(target)
316 .expect("target node is not an element!")
317 }
318
319 /// Create an element.
320 /// When creating a template element (`name.ns.expanded() == expanded_name!(html"template")`), an
321 /// associated document fragment called the "template contents" should also be created. Later calls to
322 /// self.get_template_contents() with that given element return it. See `the template element in the whatwg spec`,
323 #[inline]
324 fn create_element(
325 &self,
326 name: QualName,
327 attrs: Vec<Attribute>,
328 flags: ElementFlags,
329 ) -> Self::Handle {
330 let mut nodes = self.tree.nodes.borrow_mut();
331 let new_elem_id = NodeId::new(nodes.len());
332 let template_contents = if flags.template {
333 Some(NodeId::new(nodes.len() + 1))
334 } else {
335 None
336 };
337
338 let data = NodeData::Element(Element::new(
339 name,
340 attrs,
341 template_contents,
342 flags.mathml_annotation_xml_integration_point,
343 ));
344
345 nodes.push(TreeNode::new(new_elem_id, data));
346
347 if let Some(fragment_id) = template_contents {
348 nodes.push(TreeNode::new(fragment_id, NodeData::Fragment));
349 // The template's content is considered outside of the main document,
350 // so its DocumentFragment remains parentless.
351 }
352
353 new_elem_id
354 }
355
356 /// Create a comment node.
357 #[inline]
358 fn create_comment(&self, text: StrTendril) -> Self::Handle {
359 self.tree.create_node(NodeData::Comment {
360 contents: wrap_tendril(text),
361 })
362 }
363
364 /// Create a Processing Instruction node.
365 #[inline]
366 fn create_pi(&self, target: StrTendril, data: StrTendril) -> Self::Handle {
367 self.tree.create_node(NodeData::ProcessingInstruction {
368 target: wrap_tendril(target),
369 contents: wrap_tendril(data),
370 })
371 }
372
373 /// Append a node as the last child of the given node. If this would produce adjacent sibling text nodes, it
374 /// should concatenate the text instead.
375 /// The child node will not already have a parent.
376 fn append(&self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
377 // Append to an existing Text node if we have one.
378
379 match child {
380 NodeOrText::AppendNode(node_id) => self.tree.append_child_of(parent, &node_id),
381 NodeOrText::AppendText(text) => {
382 let last_child = self.tree.last_child_of(parent);
383 let merged = last_child
384 .map(|child| append_to_existing_text(&child, &text))
385 .unwrap_or(false);
386
387 if merged {
388 return;
389 }
390
391 self.tree.append_child_data_of(
392 parent,
393 NodeData::Text {
394 contents: wrap_tendril(text),
395 },
396 )
397 }
398 }
399 }
400
401 /// Append a node as the sibling immediately before the given node.
402 /// The tree builder promises that `sibling` is not a text node. However its old previous sibling, which would
403 /// become the new node's previous sibling, could be a text node. If the new node is also a text node, the two
404 /// should be merged, as in the behavior of `append`.
405 fn append_before_sibling(&self, sibling: &Self::Handle, child: NodeOrText<Self::Handle>) {
406 match child {
407 NodeOrText::AppendText(text) => {
408 let prev_sibling = self.tree.prev_sibling_of(sibling);
409 let merged = prev_sibling
410 .map(|sibling| append_to_existing_text(&sibling, &text))
411 .unwrap_or(false);
412
413 if merged {
414 return;
415 }
416
417 let id = self.tree.create_node(NodeData::Text {
418 contents: wrap_tendril(text),
419 });
420 self.tree.insert_before_of(sibling, &id);
421 }
422
423 // The tree builder promises we won't have a text node after
424 // the insertion point.
425
426 // Any other kind of node.
427 NodeOrText::AppendNode(id) => self.tree.insert_before_of(sibling, &id),
428 };
429 }
430
431 /// When the insertion point is decided by the existence of a parent node of the element, we consider both
432 /// possibilities and send the element which will be used if a parent node exists, along with the element to be
433 /// used if there isn't one.
434 fn append_based_on_parent_node(
435 &self,
436 element: &Self::Handle,
437 prev_element: &Self::Handle,
438 child: NodeOrText<Self::Handle>,
439 ) {
440 let has_parent = self
441 .tree
442 .nodes
443 .borrow()
444 .get(element.value)
445 .is_some_and(|node| node.parent.is_some());
446
447 if has_parent {
448 self.append_before_sibling(element, child);
449 } else {
450 self.append(prev_element, child);
451 }
452 }
453
454 /// Append a `DOCTYPE` element to the `Document` node.
455 #[inline]
456 fn append_doctype_to_document(
457 &self,
458 name: StrTendril,
459 public_id: StrTendril,
460 system_id: StrTendril,
461 ) {
462 let root = self.tree.root_id();
463 self.tree.append_child_data_of(
464 &root,
465 NodeData::Doctype {
466 name: wrap_tendril(name),
467 public_id: wrap_tendril(public_id),
468 system_id: wrap_tendril(system_id),
469 },
470 );
471 }
472
473 /// Add each attribute to the given element, if no attribute with that name already exists. The tree builder
474 /// promises this will never be called with something else than an element.
475 fn add_attrs_if_missing(&self, target: &Self::Handle, attrs: Vec<Attribute>) {
476 self.tree.update_node(target, |node| {
477 if let Some(el) = node.as_element_mut() {
478 el.add_attrs_if_missing(attrs);
479 }
480 });
481 }
482
483 /// Detach the given node from its parent.
484 #[inline]
485 fn remove_from_parent(&self, target: &Self::Handle) {
486 self.tree.remove_from_parent(target);
487 }
488
489 /// Remove all the children from node and append them to new_parent.
490 #[inline]
491 fn reparent_children(&self, node: &Self::Handle, new_parent: &Self::Handle) {
492 self.tree.reparent_children_of(node, Some(*new_parent));
493 }
494
495 fn is_mathml_annotation_xml_integration_point(&self, handle: &Self::Handle) -> bool {
496 self.tree.is_mathml_annotation_xml_integration_point(handle)
497 }
498}
499
500fn append_to_existing_text(prev: &NodeRef, text: &StrTendril) -> bool {
501 prev.tree
502 .update_node(&prev.id, |tree_node| match tree_node.data {
503 NodeData::Text { ref mut contents } => {
504 #[cfg(not(feature = "atomic"))]
505 contents.push_tendril(text);
506
507 #[cfg(feature = "atomic")]
508 contents.push_slice(text);
509 true
510 }
511 _ => false,
512 })
513 .unwrap_or(false)
514}
515
#[cfg(feature = "markdown")]
impl Document {
    /// Renders the [`Document`] as *Markdown*.
    ///
    /// Elements whose tag is listed in `skip_tags` are skipped together with
    /// their descendants.
    ///
    /// - Passing `None` uses the default list: `["script", "style", "meta", "head"]`.
    /// - Passing `Some(&[])` processes every element without exclusions.
    pub fn md(&self, skip_tags: Option<&[&str]>) -> StrTendril {
        let root = self.root();
        root.md(skip_tags)
    }
}