html2md/
markup5ever_rcdom.rs

1// This file is part of commit 8415d50 and a copy of:
2// [html5ever/rcdom/lib.rs at main · servo/html5ever · GitHub](https://github.com/servo/html5ever/blob/main/rcdom/lib.rs)
3//
4// See also:
5// [`RcDom` issues in `html2html` example · Issue #555 · servo/html5ever](https://github.com/servo/html5ever/issues/555)
6//
7//
8// Copyright 2014-2017 The html5ever Project Developers. See the
9// COPYRIGHT file at the top-level directory of this distribution.
10//
11// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
12// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
13// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
14// option. This file may not be copied, modified, or distributed
15// except according to those terms.
16
17//! A simple reference-counted DOM.
18//!
19//! This is sufficient as a static parse tree, but don't build a
20//! web browser using it. :)
21//!
22//! A DOM is a [tree structure] with ordered children that can be represented in an XML-like
23//! format. For example, the following graph
24//!
25//! ```text
26//! div
27//!  +- "text node"
28//!  +- span
29//! ```
30//! in HTML would be serialized as
31//!
32//! ```html
33//! <div>text node<span></span></div>
34//! ```
35//!
36//! See the [document object model article on wikipedia][dom wiki] for more information.
37//!
38//! This implementation stores the information associated with each node once, and then hands out
39//! refs to children. The nodes themselves are reference-counted to avoid copying - you can create
40//! a new ref and then a node will outlive the document. Nodes own their children, but only have
41//! weak references to their parents.
42//!
43//! [tree structure]: https://en.wikipedia.org/wiki/Tree_(data_structure)
44//! [dom wiki]: https://en.wikipedia.org/wiki/Document_Object_Model
45
46extern crate markup5ever;
47extern crate tendril;
48
49use std::borrow::Cow;
50use std::cell::{Cell, RefCell};
51use std::collections::{HashSet, VecDeque};
52use std::default::Default;
53use std::fmt;
54use std::io;
55use std::mem;
56use std::rc::{Rc, Weak};
57
58use tendril::StrTendril;
59
60use markup5ever::interface::tree_builder;
61use markup5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
62use markup5ever::serialize::TraversalScope;
63use markup5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};
64use markup5ever::serialize::{Serialize, Serializer};
65use markup5ever::Attribute;
66use markup5ever::ExpandedName;
67use markup5ever::QualName;
68
69/// The different kinds of nodes in the DOM.
70#[derive(Debug)]
71pub enum NodeData {
72    /// The `Document` itself - the root node of a HTML document.
73    Document,
74
75    /// A `DOCTYPE` with name, public id, and system id. See
76    /// [document type declaration on wikipedia][dtd wiki].
77    ///
78    /// [dtd wiki]: https://en.wikipedia.org/wiki/Document_type_declaration
79    Doctype {
80        name: StrTendril,
81        public_id: StrTendril,
82        system_id: StrTendril,
83    },
84
85    /// A text node.
86    Text { contents: RefCell<StrTendril> },
87
88    /// A comment.
89    Comment { contents: StrTendril },
90
91    /// An element with attributes.
92    Element {
93        name: QualName,
94        attrs: RefCell<Vec<Attribute>>,
95
96        /// For HTML \<template\> elements, the [template contents].
97        ///
98        /// [template contents]: https://html.spec.whatwg.org/multipage/#template-contents
99        template_contents: RefCell<Option<Handle>>,
100
101        /// Whether the node is a [HTML integration point].
102        ///
103        /// [HTML integration point]: https://html.spec.whatwg.org/multipage/#html-integration-point
104        mathml_annotation_xml_integration_point: bool,
105    },
106
107    /// A Processing instruction.
108    ProcessingInstruction {
109        target: StrTendril,
110        contents: StrTendril,
111    },
112}
113
114/// A DOM node.
115pub struct Node {
116    /// Parent node.
117    pub parent: Cell<Option<WeakHandle>>,
118    /// Child nodes of this node.
119    pub children: RefCell<Vec<Handle>>,
120    /// Represents this node's data.
121    pub data: NodeData,
122}
123
124impl Node {
125    /// Create a new node from its contents
126    pub fn new(data: NodeData) -> Rc<Self> {
127        Rc::new(Node {
128            data,
129            parent: Cell::new(None),
130            children: RefCell::new(Vec::new()),
131        })
132    }
133}
134
135impl Drop for Node {
136    fn drop(&mut self) {
137        let mut nodes = mem::take(&mut *self.children.borrow_mut());
138        while let Some(node) = nodes.pop() {
139            let children = mem::take(&mut *node.children.borrow_mut());
140            nodes.extend(children.into_iter());
141            if let NodeData::Element {
142                ref template_contents,
143                ..
144            } = node.data
145            {
146                if let Some(template_contents) = template_contents.borrow_mut().take() {
147                    nodes.push(template_contents);
148                }
149            }
150        }
151    }
152}
153
154impl fmt::Debug for Node {
155    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
156        fmt.debug_struct("Node")
157            .field("data", &self.data)
158            .field("children", &self.children)
159            .finish()
160    }
161}
162
163/// Reference to a DOM node.
164pub type Handle = Rc<Node>;
165
166/// Weak reference to a DOM node, used for parent pointers.
167pub type WeakHandle = Weak<Node>;
168
169/// Append a parentless node to another nodes' children
170fn append(new_parent: &Handle, child: Handle) {
171    let previous_parent = child.parent.replace(Some(Rc::downgrade(new_parent)));
172    // Invariant: child cannot have existing parent
173    assert!(previous_parent.is_none());
174    new_parent.children.borrow_mut().push(child);
175}
176
177/// If the node has a parent, get it and this node's position in its children
178fn get_parent_and_index(target: &Handle) -> Option<(Handle, usize)> {
179    match target.parent.take() { Some(weak) => {
180        let parent = weak.upgrade().expect("dangling weak pointer");
181        target.parent.set(Some(weak));
182        let i = match parent
183            .children
184            .borrow()
185            .iter()
186            .enumerate()
187            .find(|&(_, child)| Rc::ptr_eq(child, target))
188        {
189            Some((i, _)) => i,
190            None => panic!("have parent but couldn't find in parent's children!"),
191        };
192        Some((parent, i))
193    } _ => {
194        None
195    }}
196}
197
198fn append_to_existing_text(prev: &Handle, text: &str) -> bool {
199    match prev.data {
200        NodeData::Text { ref contents } => {
201            contents.borrow_mut().push_slice(text);
202            true
203        }
204        _ => false,
205    }
206}
207
208fn remove_from_parent(target: &Handle) {
209    if let Some((parent, i)) = get_parent_and_index(target) {
210        parent.children.borrow_mut().remove(i);
211        target.parent.set(None);
212    }
213}
214
215/// The DOM itself; the result of parsing.
216pub struct RcDom {
217    /// The `Document` itself.
218    pub document: Handle,
219
220    /// Errors that occurred during parsing.
221    pub errors: RefCell<Vec<Cow<'static, str>>>,
222
223    /// The document's quirks mode.
224    pub quirks_mode: Cell<QuirksMode>,
225}
226
227impl TreeSink for RcDom {
228    type Output = Self;
229    fn finish(self) -> Self {
230        self
231    }
232
233    type Handle = Handle;
234
235    type ElemName<'a>
236        = ExpandedName<'a>
237    where
238        Self: 'a;
239
240    fn parse_error(&self, msg: Cow<'static, str>) {
241        self.errors.borrow_mut().push(msg);
242    }
243
244    fn get_document(&self) -> Handle {
245        self.document.clone()
246    }
247
248    fn get_template_contents(&self, target: &Handle) -> Handle {
249        if let NodeData::Element {
250            ref template_contents,
251            ..
252        } = target.data
253        {
254            template_contents
255                .borrow()
256                .as_ref()
257                .expect("not a template element!")
258                .clone()
259        } else {
260            panic!("not a template element!")
261        }
262    }
263
264    fn set_quirks_mode(&self, mode: QuirksMode) {
265        self.quirks_mode.set(mode);
266    }
267
268    fn same_node(&self, x: &Handle, y: &Handle) -> bool {
269        Rc::ptr_eq(x, y)
270    }
271
272    fn elem_name<'a>(&self, target: &'a Handle) -> ExpandedName<'a> {
273        match target.data {
274            NodeData::Element { ref name, .. } => name.expanded(),
275            _ => panic!("not an element!"),
276        }
277    }
278
279    fn create_element(&self, name: QualName, attrs: Vec<Attribute>, flags: ElementFlags) -> Handle {
280        Node::new(NodeData::Element {
281            name,
282            attrs: RefCell::new(attrs),
283            template_contents: RefCell::new(if flags.template {
284                Some(Node::new(NodeData::Document))
285            } else {
286                None
287            }),
288            mathml_annotation_xml_integration_point: flags.mathml_annotation_xml_integration_point,
289        })
290    }
291
292    fn create_comment(&self, text: StrTendril) -> Handle {
293        Node::new(NodeData::Comment { contents: text })
294    }
295
296    fn create_pi(&self, target: StrTendril, data: StrTendril) -> Handle {
297        Node::new(NodeData::ProcessingInstruction {
298            target,
299            contents: data,
300        })
301    }
302
303    fn append(&self, parent: &Handle, child: NodeOrText<Handle>) {
304        // Append to an existing Text node if we have one.
305        if let NodeOrText::AppendText(text) = &child {
306            if let Some(h) = parent.children.borrow().last() {
307                if append_to_existing_text(h, text) {
308                    return;
309                }
310            }
311        }
312
313        append(
314            parent,
315            match child {
316                NodeOrText::AppendText(text) => Node::new(NodeData::Text {
317                    contents: RefCell::new(text),
318                }),
319                NodeOrText::AppendNode(node) => node,
320            },
321        );
322    }
323
324    fn append_before_sibling(&self, sibling: &Handle, child: NodeOrText<Handle>) {
325        let (parent, i) = get_parent_and_index(sibling)
326            .expect("append_before_sibling called on node without parent");
327
328        let child = match (child, i) {
329            // No previous node.
330            (NodeOrText::AppendText(text), 0) => Node::new(NodeData::Text {
331                contents: RefCell::new(text),
332            }),
333
334            // Look for a text node before the insertion point.
335            (NodeOrText::AppendText(text), i) => {
336                let children = parent.children.borrow();
337                let prev = &children[i - 1];
338                if append_to_existing_text(prev, &text) {
339                    return;
340                }
341                Node::new(NodeData::Text {
342                    contents: RefCell::new(text),
343                })
344            }
345
346            // The tree builder promises we won't have a text node after
347            // the insertion point.
348
349            // Any other kind of node.
350            (NodeOrText::AppendNode(node), _) => node,
351        };
352
353        remove_from_parent(&child);
354
355        child.parent.set(Some(Rc::downgrade(&parent)));
356        parent.children.borrow_mut().insert(i, child);
357    }
358
359    fn append_based_on_parent_node(
360        &self,
361        element: &Self::Handle,
362        prev_element: &Self::Handle,
363        child: NodeOrText<Self::Handle>,
364    ) {
365        let parent = element.parent.take();
366        let has_parent = parent.is_some();
367        element.parent.set(parent);
368
369        if has_parent {
370            self.append_before_sibling(element, child);
371        } else {
372            self.append(prev_element, child);
373        }
374    }
375
376    fn append_doctype_to_document(
377        &self,
378        name: StrTendril,
379        public_id: StrTendril,
380        system_id: StrTendril,
381    ) {
382        append(
383            &self.document,
384            Node::new(NodeData::Doctype {
385                name,
386                public_id,
387                system_id,
388            }),
389        );
390    }
391
392    fn add_attrs_if_missing(&self, target: &Handle, attrs: Vec<Attribute>) {
393        let mut existing = if let NodeData::Element { ref attrs, .. } = target.data {
394            attrs.borrow_mut()
395        } else {
396            panic!("not an element")
397        };
398
399        let existing_names = existing
400            .iter()
401            .map(|e| e.name.clone())
402            .collect::<HashSet<_>>();
403        existing.extend(
404            attrs
405                .into_iter()
406                .filter(|attr| !existing_names.contains(&attr.name)),
407        );
408    }
409
410    fn remove_from_parent(&self, target: &Handle) {
411        remove_from_parent(target);
412    }
413
414    fn reparent_children(&self, node: &Handle, new_parent: &Handle) {
415        let mut children = node.children.borrow_mut();
416        let mut new_children = new_parent.children.borrow_mut();
417        for child in children.iter() {
418            let previous_parent = child.parent.replace(Some(Rc::downgrade(new_parent)));
419            assert!(Rc::ptr_eq(
420                node,
421                &previous_parent.unwrap().upgrade().expect("dangling weak")
422            ))
423        }
424        new_children.extend(mem::take(&mut *children));
425    }
426
427    fn is_mathml_annotation_xml_integration_point(&self, target: &Handle) -> bool {
428        if let NodeData::Element {
429            mathml_annotation_xml_integration_point,
430            ..
431        } = target.data
432        {
433            mathml_annotation_xml_integration_point
434        } else {
435            panic!("not an element!")
436        }
437    }
438}
439
440impl Default for RcDom {
441    fn default() -> RcDom {
442        RcDom {
443            document: Node::new(NodeData::Document),
444            errors: Default::default(),
445            quirks_mode: Cell::new(tree_builder::NoQuirks),
446        }
447    }
448}
449
450enum SerializeOp {
451    Open(Handle),
452    Close(QualName),
453}
454
455pub struct SerializableHandle(Handle);
456
457impl From<Handle> for SerializableHandle {
458    fn from(h: Handle) -> SerializableHandle {
459        SerializableHandle(h)
460    }
461}
462
463impl Serialize for SerializableHandle {
464    fn serialize<S>(&self, serializer: &mut S, traversal_scope: TraversalScope) -> io::Result<()>
465    where
466        S: Serializer,
467    {
468        let mut ops = VecDeque::new();
469        match traversal_scope {
470            IncludeNode => ops.push_back(SerializeOp::Open(self.0.clone())),
471            ChildrenOnly(_) => ops.extend(
472                self.0
473                    .children
474                    .borrow()
475                    .iter()
476                    .map(|h| SerializeOp::Open(h.clone())),
477            ),
478        }
479
480        while let Some(op) = ops.pop_front() {
481            match op {
482                SerializeOp::Open(handle) => match handle.data {
483                    NodeData::Element {
484                        ref name,
485                        ref attrs,
486                        ..
487                    } => {
488                        serializer.start_elem(
489                            name.clone(),
490                            attrs.borrow().iter().map(|at| (&at.name, &at.value[..])),
491                        )?;
492
493                        ops.reserve(1 + handle.children.borrow().len());
494                        ops.push_front(SerializeOp::Close(name.clone()));
495
496                        for child in handle.children.borrow().iter().rev() {
497                            ops.push_front(SerializeOp::Open(child.clone()));
498                        }
499                    }
500
501                    NodeData::Doctype { ref name, .. } => serializer.write_doctype(name)?,
502
503                    NodeData::Text { ref contents } => serializer.write_text(&contents.borrow())?,
504
505                    NodeData::Comment { ref contents } => serializer.write_comment(contents)?,
506
507                    NodeData::ProcessingInstruction {
508                        ref target,
509                        ref contents,
510                    } => serializer.write_processing_instruction(target, contents)?,
511
512                    NodeData::Document => panic!("Can't serialize Document node itself"),
513                },
514
515                SerializeOp::Close(name) => {
516                    serializer.end_elem(name)?;
517                }
518            }
519        }
520
521        Ok(())
522    }
523}