Skip to main content

markup5ever_rcdom/
lib.rs

1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! This crate is not supported, nor published by the html5ever project.
11//!
12//! This crate is built for the express purpose of writing automated tests for the html5ever and xml5ever crates. It is not intended to be a production-quality DOM implementation, and has not been fuzzed or tested against arbitrary, malicious, or nontrivial inputs. No maintenance or support for any such issues will be provided. If you use this DOM implementation in a production, user-facing system, you do so at your own risk.
13//!
14//! A simple reference-counted DOM.
15//!
16//! This is sufficient as a static parse tree, but don't build a
17//! web browser using it. :)
18//!
19//! A DOM is a [tree structure] with ordered children that can be represented in an XML-like
20//! format. For example, the following graph
21//!
22//! ```text
23//! div
24//!  +- "text node"
25//!  +- span
26//! ```
27//! in HTML would be serialized as
28//!
29//! ```html
30//! <div>text node<span></span></div>
31//! ```
32//!
33//! See the [document object model article on wikipedia][dom wiki] for more information.
34//!
35//! This implementation stores the information associated with each node once, and then hands out
36//! refs to children. The nodes themselves are reference-counted to avoid copying - you can create
37//! a new ref and then a node will outlive the document. Nodes own their children, but only have
38//! weak references to their parents.
39//!
40//! [tree structure]: https://en.wikipedia.org/wiki/Tree_(data_structure)
41//! [dom wiki]: https://en.wikipedia.org/wiki/Document_Object_Model
42
43extern crate markup5ever;
44extern crate tendril;
45
46use std::borrow::Cow;
47use std::cell::{Cell, RefCell};
48use std::collections::{HashSet, VecDeque};
49use std::default::Default;
50use std::fmt;
51use std::io;
52use std::mem;
53use std::rc::{Rc, Weak};
54
55use tendril::StrTendril;
56
57use markup5ever::interface::tree_builder;
58use markup5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
59use markup5ever::serialize::TraversalScope;
60use markup5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};
61use markup5ever::serialize::{Serialize, Serializer};
62use markup5ever::Attribute;
63use markup5ever::ExpandedName;
64use markup5ever::QualName;
65use markup5ever::interface::ElemName;
66use markup5ever::local_name;
67
68/// The different kinds of nodes in the DOM.
69#[derive(Debug, Clone)]
70pub enum NodeData {
71    /// The `Document` itself - the root node of a HTML document.
72    Document,
73
74    /// A `DOCTYPE` with name, public id, and system id. See
75    /// [document type declaration on wikipedia][dtd wiki].
76    ///
77    /// [dtd wiki]: https://en.wikipedia.org/wiki/Document_type_declaration
78    Doctype {
79        name: StrTendril,
80        public_id: StrTendril,
81        system_id: StrTendril,
82    },
83
84    /// A text node.
85    Text { contents: RefCell<StrTendril> },
86
87    /// A comment.
88    Comment { contents: StrTendril },
89
90    /// An element with attributes.
91    Element {
92        name: QualName,
93        attrs: RefCell<Vec<Attribute>>,
94
95        /// For HTML \<template\> elements, the [template contents].
96        ///
97        /// [template contents]: https://html.spec.whatwg.org/multipage/#template-contents
98        template_contents: RefCell<Option<Handle>>,
99
100        /// Whether the node is a [HTML integration point].
101        ///
102        /// [HTML integration point]: https://html.spec.whatwg.org/multipage/#html-integration-point
103        mathml_annotation_xml_integration_point: bool,
104    },
105
106    /// A Processing instruction.
107    ProcessingInstruction {
108        target: StrTendril,
109        contents: StrTendril,
110    },
111}
112
113/// A DOM node.
114pub struct Node {
115    /// Parent node.
116    pub parent: Cell<Option<WeakHandle>>,
117    /// Child nodes of this node.
118    pub children: RefCell<Vec<Handle>>,
119    /// Represents this node's data.
120    pub data: NodeData,
121}
122
123// SAFETY: `Node` contains `Cell` and `RefCell`, which are `!Sync` in the
124// general case. We manually assert `Send + Sync` here so that
125// `Arc<Node>: Send` (which requires `Node: Send + Sync`), enabling
126// futures that hold the parsed DOM to cross thread boundaries on a
127// multi-threaded async runtime.
128//
129// Soundness invariant: a `Node` (and any `Handle = Arc<Node>` /
130// `WeakHandle = Weak<Node>` referencing it) is owned by exactly one
131// task at a time. The DOM is built by the parser, traversed/cleaned/
132// serialized by callers, and dropped — all on a single logical task.
133// Tokio's work-stealing scheduler may move the future between worker
134// threads, but only one thread polls it at any instant, so no two
135// threads ever access the same `Node` simultaneously.
136//
137// Cloning a `Handle` and using one clone on another thread *while the
138// original is still in use on the source thread* is undefined behavior.
139// Don't do that.
140//
141// `StrTendril` (= `Tendril<UTF8, Atomic>` via spider-tendril) is already
142// `Send + Sync`, so the tendrils stored inside `NodeData` impose no
143// additional constraint.
144unsafe impl Send for Node {}
145unsafe impl Sync for Node {}
146
147impl Node {
148    /// Create a new node from its contents
149    pub fn new(data: NodeData) -> Rc<Self> {
150        Rc::new(Node {
151            data,
152            parent: Cell::new(None),
153            children: RefCell::new(Vec::new()),
154        })
155    }
156
157    /// <https://html.spec.whatwg.org/#option-element-nearest-ancestor-select>
158    fn get_option_element_nearest_ancestor_select(&self) -> Option<Rc<Self>> {
159        // Step 1. Let ancestorOptgroup be null.
160        // NOTE: The algorithm doesn't actually need the value, so a boolean is enough.
161        let mut did_see_ancestor_optgroup = false;
162
163        // Step 2. For each ancestor of option's ancestors, in reverse tree order:
164        let mut current = self.parent().and_then(|parent| parent.upgrade())?;
165        loop {
166            if let NodeData::Element { name, .. } = &current.data {
167                // Step 2.1 If ancestor is a datalist, hr, or option element, then return null.
168                if matches!(
169                    name.local_name(),
170                    &local_name!("datalist") | &local_name!("hr") | &local_name!("option")
171                ) {
172                    return None;
173                }
174
175                // Step 2.2 If ancestor is an optgroup element:
176                if name.local_name() == &local_name!("optgroup") {
177                    // Step 2.2.1 If ancestorOptgroup is not null, then return null.
178                    if did_see_ancestor_optgroup {
179                        return None;
180                    }
181
182                    // Step 2.2.2 Set ancestorOptgroup to ancestor.
183                    did_see_ancestor_optgroup = true;
184                }
185
186                // Step 2.3 If ancestor is a select, then return ancestor.
187                if name.local_name() == &local_name!("select") {
188                    return Some(current);
189                }
190            };
191
192            // Move on to the next ancestor
193            let Some(next_ancestor) = current.parent().and_then(|parent| parent.upgrade()) else {
194                break;
195            };
196            current = next_ancestor;
197        }
198
199        // Step 3. Return null.
200        None
201    }
202
203    fn parent(&self) -> Option<Weak<Self>> {
204        let parent = self.parent.take();
205        self.parent.set(parent.clone());
206        parent
207    }
208
209    /// <https://html.spec.whatwg.org/#select-enabled-selectedcontent>
210    fn get_a_selects_enabled_selectedcontent(&self) -> Option<Rc<Self>> {
211        // Step 1. If select has the multiple attribute, then return null.
212        let NodeData::Element { name, attrs, .. } = &self.data else {
213            panic!("Trying to get selectedcontent of non-element");
214        };
215        debug_assert_eq!(name.local_name(), &local_name!("select"));
216        if attrs
217            .borrow()
218            .iter()
219            .any(|attribute| attribute.name.local == local_name!("multiple"))
220        {
221            return None;
222        }
223
224        // Step 2. Let selectedcontent be the first selectedcontent element descendant of select in tree order
225        // if any such element exists; otherwise return null.
226        // FIXME: This does not visit the nodes in tree order
227        let mut remaining = VecDeque::default();
228        remaining.extend(self.children.borrow().iter().cloned());
229        let mut selectedcontent = None;
230        while let Some(node) = remaining.pop_front() {
231            remaining.extend(node.children.borrow().iter().cloned());
232
233            let NodeData::Element { name, .. } = &self.data else {
234                continue;
235            };
236            if name.local_name() == &local_name!("selectedcontent") {
237                selectedcontent = Some(node);
238                break;
239            }
240        }
241        let selectedcontent = selectedcontent?;
242
243        // Step 3. If selectedcontent's disabled is true, then return null.
244        // FIXME: This step is unimplemented for now to reduce complexity.
245
246        // Step 4. Return selectedcontent.
247        Some(selectedcontent)
248    }
249
250    /// <https://html.spec.whatwg.org/#clone-an-option-into-a-selectedcontent>
251    fn clone_an_option_into_selectedcontent(&self, selectedcontent: Rc<Self>) {
252        // Step 1. Let documentFragment be a new DocumentFragment whose node document is option's node document.
253        // NOTE: We just remember the children of said fragment, thats good enough.
254        let mut document_fragment = Vec::new();
255
256        // Step 2. For each child of option's children:
257        for child in self.children.borrow().iter() {
258            // Step 2.1 Let childClone be the result of running clone given child with subtree set to true.
259            let child_clone = child.clone_with_subtree();
260
261            // Step 2.2 Append childClone to documentFragment.
262            document_fragment.push(child_clone);
263        }
264
265        // Step 3. Replace all with documentFragment within selectedcontent.
266        *selectedcontent.children.borrow_mut() = document_fragment;
267    }
268
269    /// Clones the node and all of its descendants, returning a handle to the new subtree.
270    ///
271    /// This function will run into infinite recursion when the DOM tree contains cycles and it makes
272    /// no attempts to guard against that.
273    fn clone_with_subtree(&self) -> Rc<Self> {
274        let children = self
275            .children
276            .borrow()
277            .iter()
278            .map(|child| child.clone_with_subtree())
279            .collect();
280        Rc::new(Self {
281            parent: Cell::new(self.parent()),
282            data: self.data.clone(),
283            children: RefCell::new(children),
284        })
285    }
286}
287
288impl Drop for Node {
289    fn drop(&mut self) {
290        let mut nodes = mem::take(&mut *self.children.borrow_mut());
291        while let Some(node) = nodes.pop() {
292            let children = mem::take(&mut *node.children.borrow_mut());
293            nodes.extend(children.into_iter());
294            if let NodeData::Element {
295                ref template_contents,
296                ..
297            } = node.data
298            {
299                if let Some(template_contents) = template_contents.borrow_mut().take() {
300                    nodes.push(template_contents);
301                }
302            }
303        }
304    }
305}
306
307impl fmt::Debug for Node {
308    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
309        fmt.debug_struct("Node")
310            .field("data", &self.data)
311            .field("children", &self.children)
312            .finish()
313    }
314}
315
316/// Reference to a DOM node.
317pub type Handle = Rc<Node>;
318
319/// Weak reference to a DOM node, used for parent pointers.
320pub type WeakHandle = Weak<Node>;
321
322/// Append a parentless node to another nodes' children
323fn append(new_parent: &Handle, child: Handle) {
324    let previous_parent = child.parent.replace(Some(Rc::downgrade(new_parent)));
325    // Invariant: child cannot have existing parent
326    assert!(previous_parent.is_none());
327    new_parent.children.borrow_mut().push(child);
328}
329
330/// If the node has a parent, get it and this node's position in its children
331fn get_parent_and_index(target: &Handle) -> Option<(Handle, usize)> {
332    if let Some(weak) = target.parent.take() {
333        let parent = weak.upgrade().expect("dangling weak pointer");
334        target.parent.set(Some(weak));
335        let i = match parent
336            .children
337            .borrow()
338            .iter()
339            .enumerate()
340            .find(|&(_, child)| Rc::ptr_eq(child, target))
341        {
342            Some((i, _)) => i,
343            None => panic!("have parent but couldn't find in parent's children!"),
344        };
345        Some((parent, i))
346    } else {
347        None
348    }
349}
350
351fn append_to_existing_text(prev: &Handle, text: &str) -> bool {
352    match prev.data {
353        NodeData::Text { ref contents } => {
354            contents.borrow_mut().push_slice(text);
355            true
356        },
357        _ => false,
358    }
359}
360
361fn remove_from_parent(target: &Handle) {
362    if let Some((parent, i)) = get_parent_and_index(target) {
363        parent.children.borrow_mut().remove(i);
364        target.parent.set(None);
365    }
366}
367
368/// The DOM itself; the result of parsing.
369pub struct RcDom {
370    /// The `Document` itself.
371    pub document: Handle,
372
373    /// Errors that occurred during parsing.
374    pub errors: RefCell<Vec<Cow<'static, str>>>,
375
376    /// The document's quirks mode.
377    pub quirks_mode: Cell<QuirksMode>,
378}
379
380impl TreeSink for RcDom {
381    type Output = Self;
382    fn finish(self) -> Self {
383        self
384    }
385
386    type Handle = Handle;
387
388    type ElemName<'a>
389        = ExpandedName<'a>
390    where
391        Self: 'a;
392
393    fn parse_error(&self, msg: Cow<'static, str>) {
394        self.errors.borrow_mut().push(msg);
395    }
396
397    fn get_document(&self) -> Handle {
398        self.document.clone()
399    }
400
401    fn get_template_contents(&self, target: &Handle) -> Handle {
402        if let NodeData::Element {
403            ref template_contents,
404            ..
405        } = target.data
406        {
407            template_contents
408                .borrow()
409                .as_ref()
410                .expect("not a template element!")
411                .clone()
412        } else {
413            panic!("not a template element!")
414        }
415    }
416
417    fn set_quirks_mode(&self, mode: QuirksMode) {
418        self.quirks_mode.set(mode);
419    }
420
421    fn same_node(&self, x: &Handle, y: &Handle) -> bool {
422        Rc::ptr_eq(x, y)
423    }
424
425    fn elem_name<'a>(&self, target: &'a Handle) -> ExpandedName<'a> {
426        match target.data {
427            NodeData::Element { ref name, .. } => name.expanded(),
428            _ => panic!("not an element!"),
429        }
430    }
431
432    fn create_element(&self, name: QualName, attrs: Vec<Attribute>, flags: ElementFlags) -> Handle {
433        Node::new(NodeData::Element {
434            name,
435            attrs: RefCell::new(attrs),
436            template_contents: RefCell::new(if flags.template {
437                Some(Node::new(NodeData::Document))
438            } else {
439                None
440            }),
441            mathml_annotation_xml_integration_point: flags.mathml_annotation_xml_integration_point,
442        })
443    }
444
445    fn create_comment(&self, text: StrTendril) -> Handle {
446        Node::new(NodeData::Comment { contents: text })
447    }
448
449    fn create_pi(&self, target: StrTendril, data: StrTendril) -> Handle {
450        Node::new(NodeData::ProcessingInstruction {
451            target,
452            contents: data,
453        })
454    }
455
456    fn append(&self, parent: &Handle, child: NodeOrText<Handle>) {
457        // Append to an existing Text node if we have one.
458        if let NodeOrText::AppendText(text) = &child {
459            if let Some(h) = parent.children.borrow().last() {
460                if append_to_existing_text(h, text) {
461                    return;
462                }
463            }
464        }
465
466        append(
467            parent,
468            match child {
469                NodeOrText::AppendText(text) => Node::new(NodeData::Text {
470                    contents: RefCell::new(text),
471                }),
472                NodeOrText::AppendNode(node) => node,
473            },
474        );
475    }
476
477    fn append_before_sibling(&self, sibling: &Handle, child: NodeOrText<Handle>) {
478        let (parent, i) = get_parent_and_index(sibling)
479            .expect("append_before_sibling called on node without parent");
480
481        let child = match (child, i) {
482            // No previous node.
483            (NodeOrText::AppendText(text), 0) => Node::new(NodeData::Text {
484                contents: RefCell::new(text),
485            }),
486
487            // Look for a text node before the insertion point.
488            (NodeOrText::AppendText(text), i) => {
489                let children = parent.children.borrow();
490                let prev = &children[i - 1];
491                if append_to_existing_text(prev, &text) {
492                    return;
493                }
494                Node::new(NodeData::Text {
495                    contents: RefCell::new(text),
496                })
497            },
498
499            // The tree builder promises we won't have a text node after
500            // the insertion point.
501
502            // Any other kind of node.
503            (NodeOrText::AppendNode(node), _) => node,
504        };
505
506        remove_from_parent(&child);
507
508        child.parent.set(Some(Rc::downgrade(&parent)));
509        parent.children.borrow_mut().insert(i, child);
510    }
511
512    fn append_based_on_parent_node(
513        &self,
514        element: &Self::Handle,
515        prev_element: &Self::Handle,
516        child: NodeOrText<Self::Handle>,
517    ) {
518        let parent = element.parent.take();
519        let has_parent = parent.is_some();
520        element.parent.set(parent);
521
522        if has_parent {
523            self.append_before_sibling(element, child);
524        } else {
525            self.append(prev_element, child);
526        }
527    }
528
529    fn append_doctype_to_document(
530        &self,
531        name: StrTendril,
532        public_id: StrTendril,
533        system_id: StrTendril,
534    ) {
535        append(
536            &self.document,
537            Node::new(NodeData::Doctype {
538                name,
539                public_id,
540                system_id,
541            }),
542        );
543    }
544
545    fn add_attrs_if_missing(&self, target: &Handle, attrs: Vec<Attribute>) {
546        let mut existing = if let NodeData::Element { ref attrs, .. } = target.data {
547            attrs.borrow_mut()
548        } else {
549            panic!("not an element")
550        };
551
552        let existing_names = existing
553            .iter()
554            .map(|e| e.name.clone())
555            .collect::<HashSet<_>>();
556        existing.extend(
557            attrs
558                .into_iter()
559                .filter(|attr| !existing_names.contains(&attr.name)),
560        );
561    }
562
563    fn remove_from_parent(&self, target: &Handle) {
564        remove_from_parent(target);
565    }
566
567    fn reparent_children(&self, node: &Handle, new_parent: &Handle) {
568        let mut children = node.children.borrow_mut();
569        let mut new_children = new_parent.children.borrow_mut();
570        for child in children.iter() {
571            let previous_parent = child.parent.replace(Some(Rc::downgrade(new_parent)));
572            assert!(Rc::ptr_eq(
573                node,
574                &previous_parent.unwrap().upgrade().expect("dangling weak")
575            ))
576        }
577        new_children.extend(mem::take(&mut *children));
578    }
579
580    fn is_mathml_annotation_xml_integration_point(&self, target: &Handle) -> bool {
581        if let NodeData::Element {
582            mathml_annotation_xml_integration_point,
583            ..
584        } = target.data
585        {
586            mathml_annotation_xml_integration_point
587        } else {
588            panic!("not an element!")
589        }
590    }
591
592    fn maybe_clone_an_option_into_selectedcontent(&self, option: &Self::Handle) {
593        let NodeData::Element { name, attrs, .. } = &option.data else {
594            panic!("\"maybe clone an option into selectedcontent\" called with non-element node");
595        };
596        debug_assert_eq!(name.local_name(), &local_name!("option"));
597
598        // Step 1. Let select be option's option element nearest ancestor select.
599        let select = option.get_option_element_nearest_ancestor_select();
600
601        // Step 2. If all of the following conditions are true:
602        // * select is not null;
603        // * option's selectedness is true; and
604        // * select's enabled selectedcontent is not null,
605        // then run clone an option into a selectedcontent given option and select's enabled selectedcontent.
606        if let Some(selectedcontent) =
607            select.and_then(|select| select.get_a_selects_enabled_selectedcontent())
608        {
609            if attrs
610                .borrow()
611                .iter()
612                .any(|attribute| attribute.name.local == local_name!("selected"))
613            {
614                option.clone_an_option_into_selectedcontent(selectedcontent);
615            }
616        }
617    }
618}
619
620impl Default for RcDom {
621    fn default() -> RcDom {
622        RcDom {
623            document: Node::new(NodeData::Document),
624            errors: Default::default(),
625            quirks_mode: Cell::new(tree_builder::NoQuirks),
626        }
627    }
628}
629
630enum SerializeOp {
631    Open(Handle),
632    Close(QualName),
633}
634
635pub struct SerializableHandle(Handle);
636
637impl From<Handle> for SerializableHandle {
638    fn from(h: Handle) -> SerializableHandle {
639        SerializableHandle(h)
640    }
641}
642
643impl Serialize for SerializableHandle {
644    fn serialize<S>(&self, serializer: &mut S, traversal_scope: TraversalScope) -> io::Result<()>
645    where
646        S: Serializer,
647    {
648        let mut ops = VecDeque::new();
649        match traversal_scope {
650            IncludeNode => ops.push_back(SerializeOp::Open(self.0.clone())),
651            ChildrenOnly(_) => ops.extend(
652                self.0
653                    .children
654                    .borrow()
655                    .iter()
656                    .map(|h| SerializeOp::Open(h.clone())),
657            ),
658        }
659
660        while let Some(op) = ops.pop_front() {
661            match op {
662                SerializeOp::Open(handle) => match handle.data {
663                    NodeData::Element {
664                        ref name,
665                        ref attrs,
666                        ..
667                    } => {
668                        serializer.start_elem(
669                            name.clone(),
670                            attrs.borrow().iter().map(|at| (&at.name, &at.value[..])),
671                        )?;
672
673                        ops.reserve(1 + handle.children.borrow().len());
674                        ops.push_front(SerializeOp::Close(name.clone()));
675
676                        for child in handle.children.borrow().iter().rev() {
677                            ops.push_front(SerializeOp::Open(child.clone()));
678                        }
679                    },
680
681                    NodeData::Doctype { ref name, .. } => serializer.write_doctype(name)?,
682
683                    NodeData::Text { ref contents } => serializer.write_text(&contents.borrow())?,
684
685                    NodeData::Comment { ref contents } => serializer.write_comment(contents)?,
686
687                    NodeData::ProcessingInstruction {
688                        ref target,
689                        ref contents,
690                    } => serializer.write_processing_instruction(target, contents)?,
691
692                    NodeData::Document => panic!("Can't serialize Document node itself"),
693                },
694
695                SerializeOp::Close(name) => {
696                    serializer.end_elem(name)?;
697                },
698            }
699        }
700
701        Ok(())
702    }
703}