Skip to main content

blitz_html/
html_sink.rs

1//! An implementation for Html5ever's sink trait, allowing us to parse HTML into a DOM.
2
3use html5ever::ParseOpts;
4use html5ever::tokenizer::TokenizerOpts;
5use html5ever::tree_builder::TreeBuilderOpts;
6use std::borrow::Cow;
7use std::cell::{Cell, Ref, RefCell, RefMut};
8
9use blitz_dom::node::Attribute;
10use blitz_dom::{DocumentMutator, HtmlParserProvider};
11use html5ever::{
12    QualName,
13    tendril::{StrTendril, TendrilSink},
14    tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink},
15};
16
17/// Convert an html5ever Attribute which uses tendril for its value to a blitz Attribute
18/// which uses String.
19fn html5ever_to_blitz_attr(attr: html5ever::Attribute) -> Attribute {
20    Attribute {
21        name: attr.name,
22        value: attr.value.to_string(),
23    }
24}
25
/// Unit-struct [`HtmlParserProvider`] whose implementation parses HTML with
/// [`DocumentHtmlParser`].
#[derive(Copy, Clone, Default, Debug)]
pub struct HtmlProvider;
28
impl HtmlParserProvider for HtmlProvider {
    /// Parses `html` as a fragment for the element `element_id` by delegating to
    /// [`DocumentHtmlParser::parse_inner_html_into_mutator`].
    fn parse_inner_html<'m2, 'doc2>(
        &self,
        mutr: &'m2 mut DocumentMutator<'doc2>,
        element_id: usize,
        html: &str,
    ) {
        DocumentHtmlParser::parse_inner_html_into_mutator(mutr, element_id, html);
    }
}
39
/// A [`TreeSink`] that writes the nodes produced by the parser into a document through
/// a [`DocumentMutator`].
pub struct DocumentHtmlParser<'m, 'doc> {
    /// The mutator that receives the parsed nodes. It is wrapped in a `RefCell` because
    /// the `TreeSink` methods in this file take `&self` yet need mutable access to it.
    document_mutator: RefCell<&'m mut DocumentMutator<'doc>>,

    /// Errors that occurred during parsing.
    pub errors: RefCell<Vec<Cow<'static, str>>>,

    /// The document's quirks mode.
    pub quirks_mode: Cell<QuirksMode>,
    /// Set to `true` when the input is parsed as XHTML with the XML parser.
    pub is_xml: bool,
}
50
51impl<'m, 'doc> DocumentHtmlParser<'m, 'doc> {
52    #[track_caller]
53    /// Get a mutable borrow of the DocumentMutator
54    fn mutr(&self) -> RefMut<'_, &'m mut DocumentMutator<'doc>> {
55        self.document_mutator.borrow_mut()
56    }
57}
58
59impl<'m, 'doc> DocumentHtmlParser<'m, 'doc> {
60    pub fn new(mutr: &'m mut DocumentMutator<'doc>) -> DocumentHtmlParser<'m, 'doc> {
61        DocumentHtmlParser {
62            document_mutator: RefCell::new(mutr),
63            errors: RefCell::new(Vec::new()),
64            quirks_mode: Cell::new(QuirksMode::NoQuirks),
65            is_xml: false,
66        }
67    }
68
69    pub fn parse_into_mutator<'a, 'd>(mutr: &'a mut DocumentMutator<'d>, html: &str) {
70        let mut sink = DocumentHtmlParser::new(mutr);
71
72        let is_xhtml_doc = html.starts_with("<?xml")
73            || html.starts_with("<!DOCTYPE") && {
74                let first_line = html.lines().next().unwrap();
75                first_line.contains("XHTML") || first_line.contains("xhtml")
76            };
77
78        if is_xhtml_doc {
79            // Parse as XHTML
80            sink.is_xml = true;
81            xml5ever::driver::parse_document(sink, Default::default())
82                .from_utf8()
83                .read_from(&mut html.as_bytes())
84                .unwrap();
85        } else {
86            // Parse as HTML
87            sink.is_xml = false;
88            let opts = ParseOpts {
89                tokenizer: TokenizerOpts::default(),
90                tree_builder: TreeBuilderOpts {
91                    exact_errors: false,
92                    scripting_enabled: false, // Enables parsing of <noscript> tags
93                    iframe_srcdoc: false,
94                    drop_doctype: true,
95                    quirks_mode: QuirksMode::NoQuirks,
96                },
97            };
98            html5ever::parse_document(sink, opts)
99                .from_utf8()
100                .read_from(&mut html.as_bytes())
101                .unwrap();
102        }
103    }
104
105    pub fn parse_inner_html_into_mutator<'a, 'd>(
106        mutr: &'a mut DocumentMutator<'d>,
107        element_id: usize,
108        html: &str,
109    ) {
110        let sink = DocumentHtmlParser::new(mutr);
111
112        let opts = ParseOpts {
113            tokenizer: TokenizerOpts::default(),
114            tree_builder: TreeBuilderOpts {
115                exact_errors: false,
116                scripting_enabled: false, // Enables parsing of <noscript> tags
117                iframe_srcdoc: false,
118                drop_doctype: true,
119                quirks_mode: QuirksMode::NoQuirks,
120            },
121        };
122        html5ever::driver::parse_fragment_for_element(sink, opts, element_id, false, None)
123            .from_utf8()
124            .read_from(&mut html.as_bytes())
125            .unwrap();
126
127        // html5ever creates a new fragment root node under the document node and parses the nodes into that fragment root.
128        // So here we move the children of the fragment root to element_id and then remove the fragment root
129        let fragment_root_id = mutr.last_child_id(0).unwrap();
130        let child_ids = mutr.child_ids(fragment_root_id);
131        mutr.append_children(element_id, &child_ids);
132        mutr.remove_node(fragment_root_id);
133    }
134}
135
136impl<'m, 'doc> TreeSink for DocumentHtmlParser<'m, 'doc> {
137    type Output = ();
138
139    // we use the ID of the nodes in the tree as the handle
140    type Handle = usize;
141
142    type ElemName<'a>
143        = Ref<'a, QualName>
144    where
145        Self: 'a;
146
147    fn finish(self) -> Self::Output {
148        #[cfg(feature = "tracing")]
149        for error in self.errors.borrow().iter() {
150            tracing::error!("{error}");
151        }
152    }
153
154    fn parse_error(&self, msg: Cow<'static, str>) {
155        self.errors.borrow_mut().push(msg);
156    }
157
158    fn get_document(&self) -> Self::Handle {
159        0
160    }
161
162    fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> Self::ElemName<'a> {
163        Ref::map(self.document_mutator.borrow(), |docm| {
164            docm.element_name(*target)
165                .expect("TreeSink::elem_name called on a node which is not an element!")
166        })
167    }
168
169    fn create_element(
170        &self,
171        name: QualName,
172        attrs: Vec<html5ever::Attribute>,
173        _flags: ElementFlags,
174    ) -> Self::Handle {
175        let attrs = attrs.into_iter().map(html5ever_to_blitz_attr).collect();
176        self.mutr().create_element(name, attrs)
177    }
178
179    fn create_comment(&self, _text: StrTendril) -> Self::Handle {
180        self.mutr().create_comment_node()
181    }
182
183    fn create_pi(&self, _target: StrTendril, _data: StrTendril) -> Self::Handle {
184        self.mutr().create_comment_node()
185    }
186
187    fn append(&self, parent_id: &Self::Handle, child: NodeOrText<Self::Handle>) {
188        match child {
189            NodeOrText::AppendNode(id) => self.mutr().append_children(*parent_id, &[id]),
190            // If content to append is text, first attempt to append it to the last child of parent.
191            // Else create a new text node and append it to the parent
192            NodeOrText::AppendText(text) => {
193                let last_child_id = self.mutr().last_child_id(*parent_id);
194                let has_appended = if let Some(id) = last_child_id {
195                    self.mutr().append_text_to_node(id, &text).is_ok()
196                } else {
197                    false
198                };
199                if !has_appended {
200                    let new_child_id = self.mutr().create_text_node(&text);
201                    self.mutr().append_children(*parent_id, &[new_child_id]);
202                }
203            }
204        }
205    }
206
207    // Note: The tree builder promises we won't have a text node after the insertion point.
208    // https://github.com/servo/html5ever/blob/main/rcdom/lib.rs#L338
209    fn append_before_sibling(&self, sibling_id: &Self::Handle, new_node: NodeOrText<Self::Handle>) {
210        match new_node {
211            NodeOrText::AppendNode(id) => self.mutr().insert_nodes_before(*sibling_id, &[id]),
212            // If content to append is text, first attempt to append it to the node before sibling_node
213            // Else create a new text node and insert it before sibling_node
214            NodeOrText::AppendText(text) => {
215                let previous_sibling_id = self.mutr().previous_sibling_id(*sibling_id);
216                let has_appended = if let Some(id) = previous_sibling_id {
217                    self.mutr().append_text_to_node(id, &text).is_ok()
218                } else {
219                    false
220                };
221                if !has_appended {
222                    let new_child_id = self.mutr().create_text_node(&text);
223                    self.mutr()
224                        .insert_nodes_before(*sibling_id, &[new_child_id]);
225                }
226            }
227        };
228    }
229
230    fn append_based_on_parent_node(
231        &self,
232        element: &Self::Handle,
233        prev_element: &Self::Handle,
234        child: NodeOrText<Self::Handle>,
235    ) {
236        if self.mutr().node_has_parent(*element) {
237            self.append_before_sibling(element, child);
238        } else {
239            self.append(prev_element, child);
240        }
241    }
242
243    fn append_doctype_to_document(
244        &self,
245        _name: StrTendril,
246        _public_id: StrTendril,
247        _system_id: StrTendril,
248    ) {
249        // Ignore. We don't care about the DOCTYPE for now.
250    }
251
252    fn get_template_contents(&self, target: &Self::Handle) -> Self::Handle {
253        // TODO: implement templates properly. This should allow to function like regular elements.
254        *target
255    }
256
257    fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
258        x == y
259    }
260
261    fn set_quirks_mode(&self, mode: QuirksMode) {
262        self.quirks_mode.set(mode);
263    }
264
265    fn add_attrs_if_missing(&self, target: &Self::Handle, attrs: Vec<html5ever::Attribute>) {
266        let attrs = attrs.into_iter().map(html5ever_to_blitz_attr).collect();
267        self.mutr().add_attrs_if_missing(*target, attrs);
268    }
269
270    fn remove_from_parent(&self, target: &Self::Handle) {
271        self.mutr().remove_node(*target);
272    }
273
274    fn reparent_children(&self, old_parent_id: &Self::Handle, new_parent_id: &Self::Handle) {
275        self.mutr()
276            .reparent_children(*old_parent_id, *new_parent_id);
277    }
278}
279
/// Smoke test: parse a small HTML document into a fresh `BaseDocument` and print the tree.
#[test]
fn parses_some_html() {
    use blitz_dom::{BaseDocument, DocumentConfig};

    let html = "<!DOCTYPE html><html><body><h1>hello world</h1></body></html>";
    let mut doc = BaseDocument::new(DocumentConfig::default());

    // Keep the mutator in its own scope so it is dropped before the tree is printed.
    {
        let mut mutr = doc.mutate();
        let sink = DocumentHtmlParser::new(&mut mutr);

        html5ever::parse_document(sink, Default::default())
            .from_utf8()
            .read_from(&mut html.as_bytes())
            .unwrap();
    }

    // Now our tree should have some nodes in it
    doc.print_tree()
}