css_inline/html/
parser.rs

1use super::{
2    attributes::should_ignore,
3    document::Document,
4    node::{ElementData, Node, NodeData, NodeId},
5};
6use html5ever::{
7    expanded_name, local_name, namespace_url, ns,
8    tendril::{StrTendril, TendrilSink},
9    tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink},
10    Attribute, ExpandedName, QualName,
11};
12use std::borrow::Cow;
13
/// Selects how the parser interprets its input.
#[derive(Clone, Copy, Debug)]
pub(crate) enum InliningMode {
    /// The input is a complete HTML document.
    Document,
    /// The input is a fragment of HTML.
    Fragment,
}
21
22/// Parse input bytes into an HTML document.
23pub(crate) fn parse_with_options(
24    bytes: &[u8],
25    preallocate_node_capacity: usize,
26    mode: InliningMode,
27) -> Document {
28    let sink = Sink {
29        document: Document::with_capacity(preallocate_node_capacity),
30    };
31    let options = html5ever::ParseOpts::default();
32    match mode {
33        InliningMode::Document => html5ever::parse_document(sink, options)
34            .from_utf8()
35            .one(bytes),
36        InliningMode::Fragment => {
37            let mut document = html5ever::parse_fragment(
38                sink,
39                options,
40                QualName::new(None, ns!(html), local_name!("")),
41                vec![],
42            )
43            .from_utf8()
44            .one(bytes);
45            let document_id = NodeId::document_id();
46            let context_element_id = NodeId::new(
47                document_id
48                    .get()
49                    // The first one is a node representing the "" element passed above, then the
50                    // second one is the "html" element.
51                    .checked_add(2)
52                    .expect("Document id is too small to overflow"),
53            );
54            document.reparent_children(context_element_id, document_id);
55            document
56        }
57    }
58}
59
60/// Intermediary structure for parsing an HTML document.
61/// It takes care of creating and appending nodes to the document as the parsing progresses.
62struct Sink {
63    /// An HTML document that is being parsed.
64    document: Document,
65}
66
67impl Sink {
68    /// Push a new node into the document.
69    fn push_node(&mut self, data: NodeData) -> NodeId {
70        self.document.push_node(data)
71    }
72
73    fn push_element(
74        &mut self,
75        name: QualName,
76        attributes: Vec<Attribute>,
77        inlining_ignored: bool,
78    ) -> NodeId {
79        let node_id = self.push_node(NodeData::Element {
80            element: ElementData::new(name, attributes),
81            inlining_ignored,
82        });
83        self.document.push_element_id(node_id);
84        node_id
85    }
86
87    fn push_text(&mut self, text: StrTendril) -> NodeId {
88        self.push_node(NodeData::Text { text })
89    }
90
91    fn push_comment(&mut self, text: StrTendril) -> NodeId {
92        self.push_node(NodeData::Comment { text })
93    }
94
95    fn push_processing_instruction(&mut self, target: StrTendril, data: StrTendril) -> NodeId {
96        self.push_node(NodeData::ProcessingInstruction { target, data })
97    }
98
99    fn push_doctype(&mut self, name: StrTendril) -> NodeId {
100        self.push_node(NodeData::Doctype { name })
101    }
102
103    /// Append a new node or text to the document.
104    fn append_impl<P, A>(&mut self, child: NodeOrText<NodeId>, previous: P, append: A)
105    where
106        P: FnOnce(&mut Document) -> Option<NodeId>,
107        A: FnOnce(&mut Document, NodeId),
108    {
109        let new_node = match child {
110            NodeOrText::AppendText(text) => {
111                // If the previous node is a text node, append to it.
112                // Otherwise create a new text node.
113                if let Some(id) = previous(&mut self.document) {
114                    if let Node {
115                        data: NodeData::Text { text: existing },
116                        ..
117                    } = &mut self.document[id]
118                    {
119                        existing.push_tendril(&text);
120                        return;
121                    }
122                }
123                self.push_text(text)
124            }
125            NodeOrText::AppendNode(node) => node,
126        };
127
128        append(&mut self.document, new_node);
129    }
130}
131
132impl TreeSink for Sink {
133    type Handle = NodeId;
134    type Output = Document;
135
136    fn finish(self) -> Document {
137        self.document
138    }
139
140    fn parse_error(&mut self, _msg: Cow<'static, str>) {}
141
142    fn get_document(&mut self) -> NodeId {
143        NodeId::document_id()
144    }
145
146    fn elem_name<'a>(&'a self, &target: &'a NodeId) -> ExpandedName<'a> {
147        self.document[target]
148            .as_element()
149            // The `TreeSink` trait promises to never call this method on non-element node
150            .expect("Not an element")
151            .name
152            .expanded()
153    }
154
155    fn create_element(
156        &mut self,
157        name: QualName,
158        attrs: Vec<Attribute>,
159        _flags: ElementFlags,
160    ) -> NodeId {
161        // Determine if we should ignore inlining for this element based on its attributes
162        let inlining_ignored = should_ignore(&attrs);
163
164        // Determine if the element is a `style` element or a linked stylesheet (`link` with `rel="stylesheet"`).
165        let (is_style, is_stylesheet) = {
166            // If inlining is ignored, we consider neither to be true.
167            if inlining_ignored {
168                (false, false)
169            } else if name.expanded() == expanded_name!(html "style") {
170                (true, false)
171            } else if name.expanded() == expanded_name!(html "link") {
172                let mut rel_stylesheet = false;
173                let mut href_non_empty = false;
174                for attr in &attrs {
175                    if attr.name.local == local_name!("rel") && attr.value == "stylesheet".into() {
176                        rel_stylesheet = true;
177                    }
178                    // Skip links with empty `href` attributes
179                    if attr.name.local == local_name!("href") && !attr.value.is_empty() {
180                        href_non_empty = true;
181                    }
182                    if rel_stylesheet && href_non_empty {
183                        break;
184                    }
185                }
186                (false, rel_stylesheet && href_non_empty)
187            } else {
188                (false, false)
189            }
190        };
191        let element = self.push_element(name, attrs, inlining_ignored);
192        // Collect `style` tags and linked stylesheets separately to use them for CSS inlining later.
193        if is_style {
194            self.document.add_style(element);
195        }
196        if is_stylesheet {
197            self.document.add_linked_stylesheet(element);
198        }
199        element
200    }
201
202    fn create_comment(&mut self, text: StrTendril) -> NodeId {
203        self.push_comment(text)
204    }
205
206    fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> NodeId {
207        self.push_processing_instruction(target, data)
208    }
209
210    /// Append a node as the last child of the given node.
211    fn append(&mut self, &parent: &NodeId, child: NodeOrText<NodeId>) {
212        self.append_impl(
213            child,
214            |document| document[parent].last_child,
215            |document, new_node| document.append(parent, new_node),
216        );
217    }
218
219    fn append_based_on_parent_node(
220        &mut self,
221        element: &NodeId,
222        prev_element: &NodeId,
223        child: NodeOrText<NodeId>,
224    ) {
225        if self.document[*element].parent.is_some() {
226            self.append_before_sibling(element, child);
227        } else {
228            self.append(prev_element, child);
229        }
230    }
231
232    /// Append a `DOCTYPE` element to the `Document` node.
233    fn append_doctype_to_document(
234        &mut self,
235        name: StrTendril,
236        _public_id: StrTendril,
237        _system_id: StrTendril,
238    ) {
239        let node = self.push_doctype(name);
240        self.document.append(NodeId::document_id(), node);
241    }
242
243    fn get_template_contents(&mut self, &target: &NodeId) -> NodeId {
244        target
245    }
246
247    /// Do two handles refer to the same node?
248    fn same_node(&self, x: &NodeId, y: &NodeId) -> bool {
249        x == y
250    }
251
252    fn set_quirks_mode(&mut self, _mode: QuirksMode) {}
253
254    /// Append a node as the sibling immediately before the given node.
255    fn append_before_sibling(&mut self, &sibling: &NodeId, child: NodeOrText<NodeId>) {
256        self.append_impl(
257            child,
258            |document| document[sibling].previous_sibling,
259            |document, node| document.insert_before(sibling, node),
260        );
261    }
262
263    /// Add each attribute to the given element, if no attribute with that name already exists.
264    fn add_attrs_if_missing(&mut self, &target: &NodeId, attrs: Vec<Attribute>) {
265        let element = self.document[target]
266            .as_element_mut()
267            .expect("not an element");
268        let attributes = &mut element.attributes;
269        for attr in attrs {
270            if attributes
271                .attributes
272                .iter()
273                .any(|entry| entry.name == attr.name)
274            {
275                attributes.attributes.push(attr);
276            }
277        }
278    }
279
280    /// Detach the given node from its parent.
281    fn remove_from_parent(&mut self, &target: &NodeId) {
282        self.document.detach(target);
283    }
284
285    /// Remove all the children from node and append them to `new_parent`.
286    fn reparent_children(&mut self, node: &NodeId, new_parent: &NodeId) {
287        self.document.reparent_children(*node, *new_parent);
288    }
289}