sws_scraper/html/
tree_sink.rs

1use std::borrow::Cow;
2
3use html5ever::tendril::StrTendril;
4use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
5use html5ever::Attribute;
6use html5ever::{ExpandedName, QualName};
7use sws_tree::NodeId;
8
9use super::Html;
10use crate::node::{Comment, Doctype, Element, Node, ProcessingInstruction, Text};
11
12/// Note: does not support the `<template>` element.
13impl TreeSink for Html {
14    type Output = Self;
15    type Handle = NodeId;
16
17    fn finish(self) -> Self {
18        self
19    }
20
21    // Signal a parse error.
22    fn parse_error(&mut self, msg: Cow<'static, str>) {
23        self.errors.push(msg);
24    }
25
26    // Set the document's quirks mode.
27    fn set_quirks_mode(&mut self, mode: QuirksMode) {
28        self.quirks_mode = mode;
29    }
30
31    // Get a handle to the Document node.
32    fn get_document(&mut self) -> Self::Handle {
33        self.tree.root().id()
34    }
35
36    // Do two handles refer to the same node?
37    fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
38        x == y
39    }
40
41    // What is the name of this element?
42    //
43    // Should never be called on a non-element node; feel free to panic!.
44    fn elem_name(&self, target: &Self::Handle) -> ExpandedName {
45        self.tree
46            .get(*target)
47            .unwrap()
48            .map_value(|v| unsafe { std::mem::transmute(v.as_element().unwrap().name.expanded()) })
49            .unwrap()
50    }
51
52    // Create an element.
53    //
54    // When creating a template element (name.expanded() == expanded_name!(html "template")), an
55    // associated document fragment called the "template contents" should also be created. Later
56    // calls to self.get_template_contents() with that given element return it.
57    fn create_element(
58        &mut self,
59        name: QualName,
60        attrs: Vec<Attribute>,
61        _flags: ElementFlags,
62    ) -> Self::Handle {
63        let fragment = name.expanded() == expanded_name!(html "template");
64
65        let node_id = self
66            .tree
67            .orphan(Node::Element(Element::new(name.clone(), attrs)));
68
69        if fragment {
70            self.tree.get(node_id).unwrap().append(Node::Fragment);
71        }
72
73        node_id
74    }
75
76    // Create a comment node.
77    fn create_comment(&mut self, text: StrTendril) -> Self::Handle {
78        self.tree.orphan(Node::Comment(Comment { comment: text }))
79    }
80
81    // Append a DOCTYPE element to the Document node.
82    fn append_doctype_to_document(
83        &mut self,
84        name: StrTendril,
85        public_id: StrTendril,
86        system_id: StrTendril,
87    ) {
88        let doctype = Doctype {
89            name,
90            public_id,
91            system_id,
92        };
93        self.tree.root().append(Node::Doctype(doctype));
94    }
95
96    // Append a node as the last child of the given node. If this would produce adjacent sibling
97    // text nodes, it should concatenate the text instead.
98    //
99    // The child node will not already have a parent.
100    fn append(&mut self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
101        let mut parent = self.tree.get(*parent).unwrap();
102
103        match child {
104            NodeOrText::AppendNode(id) => {
105                parent.append_id(id);
106            }
107
108            NodeOrText::AppendText(text) => {
109                let can_concat = parent
110                    .last_child()
111                    .map_or(false, |n| n.map_value(|v| v.is_text()).unwrap_or(false));
112
113                if can_concat {
114                    let last_child = parent.last_child().unwrap();
115                    last_child.update_value(|v| match v {
116                        Node::Text(ref mut t) => t.text.push_tendril(&text),
117                        _ => unreachable!(),
118                    });
119                } else {
120                    parent.append(Node::Text(Text { text }));
121                }
122            }
123        }
124    }
125
126    // Append a node as the sibling immediately before the given node. If that node has no parent,
127    // do nothing and return Err(new_node).
128    //
129    // The tree builder promises that sibling is not a text node. However its old previous sibling,
130    // which would become the new node's previous sibling, could be a text node. If the new node is
131    // also a text node, the two should be merged, as in the behavior of append.
132    //
133    // NB: new_node may have an old parent, from which it should be removed.
134    fn append_before_sibling(
135        &mut self,
136        sibling: &Self::Handle,
137        new_node: NodeOrText<Self::Handle>,
138    ) {
139        if let NodeOrText::AppendNode(id) = new_node {
140            self.tree.get(id).unwrap().detach();
141        }
142
143        let mut sibling = self.tree.get(*sibling).unwrap();
144        if sibling.parent().is_some() {
145            match new_node {
146                NodeOrText::AppendNode(id) => {
147                    sibling.insert_id_before(id);
148                }
149
150                NodeOrText::AppendText(text) => {
151                    let can_concat = sibling
152                        .prev_sibling()
153                        .map_or(false, |n| n.map_value(|v| v.is_text()).unwrap_or(false));
154
155                    if can_concat {
156                        let prev_sibling = sibling.prev_sibling().unwrap();
157                        prev_sibling.update_value(|v| match v {
158                            Node::Text(ref mut t) => t.text.push_tendril(&text),
159                            _ => unreachable!(),
160                        });
161                    } else {
162                        sibling.insert_before(Node::Text(Text { text }));
163                    }
164                }
165            }
166        }
167    }
168
169    // Detach the given node from its parent.
170    fn remove_from_parent(&mut self, target: &Self::Handle) {
171        self.tree.get(*target).unwrap().detach();
172    }
173
174    // Remove all the children from node and append them to new_parent.
175    fn reparent_children(&mut self, node: &Self::Handle, new_parent: &Self::Handle) {
176        self.tree
177            .get(*new_parent)
178            .unwrap()
179            .reparent_from_id_append(*node);
180    }
181
182    // Add each attribute to the given element, if no attribute with that name already exists. The
183    // tree builder promises this will never be called with something else than an element.
184    fn add_attrs_if_missing(&mut self, target: &Self::Handle, attrs: Vec<Attribute>) {
185        let node = self.tree.get(*target).unwrap();
186        node.update_value(|v| match v {
187            Node::Element(ref mut element) => {
188                for attr in attrs {
189                    element.attrs.entry(attr.name).or_insert(attr.value);
190                }
191            }
192            _ => unreachable!(),
193        });
194    }
195
196    // Get a handle to a template's template contents.
197    //
198    // The tree builder promises this will never be called with something else than a template
199    // element.
200    fn get_template_contents(&mut self, target: &Self::Handle) -> Self::Handle {
201        self.tree.get(*target).unwrap().first_child().unwrap().id()
202    }
203
204    // Mark a HTML <script> element as "already started".
205    fn mark_script_already_started(&mut self, _node: &Self::Handle) {}
206
207    // Create Processing Instruction.
208    fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Self::Handle {
209        self.tree
210            .orphan(Node::ProcessingInstruction(ProcessingInstruction {
211                target,
212                data,
213            }))
214    }
215
216    fn append_based_on_parent_node(
217        &mut self,
218        element: &Self::Handle,
219        prev_element: &Self::Handle,
220        child: NodeOrText<Self::Handle>,
221    ) {
222        if self.tree.get(*element).unwrap().parent().is_some() {
223            self.append_before_sibling(element, child)
224        } else {
225            self.append(prev_element, child)
226        }
227    }
228}