blitz_html/
html_sink.rs

//! An implementation of html5ever's `TreeSink` trait, allowing us to parse HTML into a DOM.
2
3use html5ever::ParseOpts;
4use html5ever::tokenizer::TokenizerOpts;
5use html5ever::tree_builder::TreeBuilderOpts;
6use std::borrow::Cow;
7use std::cell::{Cell, Ref, RefCell, RefMut};
8
9use blitz_dom::node::Attribute;
10use blitz_dom::{DocumentMutator, HtmlParserProvider};
11use html5ever::{
12    QualName,
13    tendril::{StrTendril, TendrilSink},
14    tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink},
15};
16
17/// Convert an html5ever Attribute which uses tendril for its value to a blitz Attribute
18/// which uses String.
19fn html5ever_to_blitz_attr(attr: html5ever::Attribute) -> Attribute {
20    Attribute {
21        name: attr.name,
22        value: attr.value.to_string(),
23    }
24}
25
26#[derive(Copy, Clone, Default, Debug)]
27pub struct HtmlProvider;
28
29impl HtmlParserProvider for HtmlProvider {
30    fn parse_inner_html<'m2, 'doc2>(
31        &self,
32        mutr: &'m2 mut DocumentMutator<'doc2>,
33        element_id: usize,
34        html: &str,
35    ) {
36        DocumentHtmlParser::parse_inner_html_into_mutator(mutr, element_id, html);
37    }
38}
39
40pub struct DocumentHtmlParser<'m, 'doc> {
41    document_mutator: RefCell<&'m mut DocumentMutator<'doc>>,
42
43    /// Errors that occurred during parsing.
44    pub errors: RefCell<Vec<Cow<'static, str>>>,
45
46    /// The document's quirks mode.
47    pub quirks_mode: Cell<QuirksMode>,
48    pub is_xml: bool,
49}
50
51impl<'m, 'doc> DocumentHtmlParser<'m, 'doc> {
52    #[track_caller]
53    /// Get a mutable borrow of the DocumentMutator
54    fn mutr(&self) -> RefMut<'_, &'m mut DocumentMutator<'doc>> {
55        self.document_mutator.borrow_mut()
56    }
57}
58
59impl<'m, 'doc> DocumentHtmlParser<'m, 'doc> {
60    pub fn new(mutr: &'m mut DocumentMutator<'doc>) -> DocumentHtmlParser<'m, 'doc> {
61        DocumentHtmlParser {
62            document_mutator: RefCell::new(mutr),
63            errors: RefCell::new(Vec::new()),
64            quirks_mode: Cell::new(QuirksMode::NoQuirks),
65            is_xml: false,
66        }
67    }
68
69    pub fn parse_into_mutator<'a, 'd>(mutr: &'a mut DocumentMutator<'d>, html: &str) {
70        let mut sink = DocumentHtmlParser::new(mutr);
71
72        let is_xhtml_doc = html.starts_with("<?xml")
73            || html.starts_with("<!DOCTYPE") && {
74                let first_line = html.lines().next().unwrap();
75                first_line.contains("XHTML") || first_line.contains("xhtml")
76            };
77
78        if is_xhtml_doc {
79            // Parse as XHTML
80            sink.is_xml = true;
81            xml5ever::driver::parse_document(sink, Default::default())
82                .from_utf8()
83                .read_from(&mut html.as_bytes())
84                .unwrap();
85        } else {
86            // Parse as HTML
87            sink.is_xml = false;
88            let opts = ParseOpts {
89                tokenizer: TokenizerOpts::default(),
90                tree_builder: TreeBuilderOpts {
91                    exact_errors: false,
92                    scripting_enabled: false, // Enables parsing of <noscript> tags
93                    iframe_srcdoc: false,
94                    drop_doctype: true,
95                    quirks_mode: QuirksMode::NoQuirks,
96                },
97            };
98            html5ever::parse_document(sink, opts)
99                .from_utf8()
100                .read_from(&mut html.as_bytes())
101                .unwrap();
102        }
103    }
104
105    pub fn parse_inner_html_into_mutator<'a, 'd>(
106        mutr: &'a mut DocumentMutator<'d>,
107        element_id: usize,
108        html: &str,
109    ) {
110        let sink = DocumentHtmlParser::new(mutr);
111
112        let opts = ParseOpts {
113            tokenizer: TokenizerOpts::default(),
114            tree_builder: TreeBuilderOpts {
115                exact_errors: false,
116                scripting_enabled: false, // Enables parsing of <noscript> tags
117                iframe_srcdoc: false,
118                drop_doctype: true,
119                quirks_mode: QuirksMode::NoQuirks,
120            },
121        };
122        html5ever::driver::parse_fragment_for_element(sink, opts, element_id, false, None)
123            .from_utf8()
124            .read_from(&mut html.as_bytes())
125            .unwrap();
126
127        // html5ever creates a new fragment root node under the document node and parses the nodes into that fragment root.
128        // So here we move the children of the fragment root to element_id and then remove the fragment root
129        let fragment_root_id = mutr.last_child_id(0).unwrap();
130        let child_ids = mutr.child_ids(fragment_root_id);
131        mutr.append_children(element_id, &child_ids);
132        mutr.remove_node(fragment_root_id);
133    }
134}
135
136impl<'m, 'doc> TreeSink for DocumentHtmlParser<'m, 'doc> {
137    type Output = ();
138
139    // we use the ID of the nodes in the tree as the handle
140    type Handle = usize;
141
142    type ElemName<'a>
143        = Ref<'a, QualName>
144    where
145        Self: 'a;
146
147    fn finish(self) -> Self::Output {
148        for error in self.errors.borrow().iter() {
149            println!("ERROR: {error}");
150        }
151    }
152
153    fn parse_error(&self, msg: Cow<'static, str>) {
154        self.errors.borrow_mut().push(msg);
155    }
156
157    fn get_document(&self) -> Self::Handle {
158        0
159    }
160
161    fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> Self::ElemName<'a> {
162        Ref::map(self.document_mutator.borrow(), |docm| {
163            docm.element_name(*target)
164                .expect("TreeSink::elem_name called on a node which is not an element!")
165        })
166    }
167
168    fn create_element(
169        &self,
170        name: QualName,
171        attrs: Vec<html5ever::Attribute>,
172        _flags: ElementFlags,
173    ) -> Self::Handle {
174        let attrs = attrs.into_iter().map(html5ever_to_blitz_attr).collect();
175        self.mutr().create_element(name, attrs)
176    }
177
178    fn create_comment(&self, _text: StrTendril) -> Self::Handle {
179        self.mutr().create_comment_node()
180    }
181
182    fn create_pi(&self, _target: StrTendril, _data: StrTendril) -> Self::Handle {
183        self.mutr().create_comment_node()
184    }
185
186    fn append(&self, parent_id: &Self::Handle, child: NodeOrText<Self::Handle>) {
187        match child {
188            NodeOrText::AppendNode(id) => self.mutr().append_children(*parent_id, &[id]),
189            // If content to append is text, first attempt to append it to the last child of parent.
190            // Else create a new text node and append it to the parent
191            NodeOrText::AppendText(text) => {
192                let last_child_id = self.mutr().last_child_id(*parent_id);
193                let has_appended = if let Some(id) = last_child_id {
194                    self.mutr().append_text_to_node(id, &text).is_ok()
195                } else {
196                    false
197                };
198                if !has_appended {
199                    let new_child_id = self.mutr().create_text_node(&text);
200                    self.mutr().append_children(*parent_id, &[new_child_id]);
201                }
202            }
203        }
204    }
205
206    // Note: The tree builder promises we won't have a text node after the insertion point.
207    // https://github.com/servo/html5ever/blob/main/rcdom/lib.rs#L338
208    fn append_before_sibling(&self, sibling_id: &Self::Handle, new_node: NodeOrText<Self::Handle>) {
209        match new_node {
210            NodeOrText::AppendNode(id) => self.mutr().insert_nodes_before(*sibling_id, &[id]),
211            // If content to append is text, first attempt to append it to the node before sibling_node
212            // Else create a new text node and insert it before sibling_node
213            NodeOrText::AppendText(text) => {
214                let previous_sibling_id = self.mutr().previous_sibling_id(*sibling_id);
215                let has_appended = if let Some(id) = previous_sibling_id {
216                    self.mutr().append_text_to_node(id, &text).is_ok()
217                } else {
218                    false
219                };
220                if !has_appended {
221                    let new_child_id = self.mutr().create_text_node(&text);
222                    self.mutr()
223                        .insert_nodes_before(*sibling_id, &[new_child_id]);
224                }
225            }
226        };
227    }
228
229    fn append_based_on_parent_node(
230        &self,
231        element: &Self::Handle,
232        prev_element: &Self::Handle,
233        child: NodeOrText<Self::Handle>,
234    ) {
235        if self.mutr().node_has_parent(*element) {
236            self.append_before_sibling(element, child);
237        } else {
238            self.append(prev_element, child);
239        }
240    }
241
242    fn append_doctype_to_document(
243        &self,
244        _name: StrTendril,
245        _public_id: StrTendril,
246        _system_id: StrTendril,
247    ) {
248        // Ignore. We don't care about the DOCTYPE for now.
249    }
250
251    fn get_template_contents(&self, target: &Self::Handle) -> Self::Handle {
252        // TODO: implement templates properly. This should allow to function like regular elements.
253        *target
254    }
255
256    fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
257        x == y
258    }
259
260    fn set_quirks_mode(&self, mode: QuirksMode) {
261        self.quirks_mode.set(mode);
262    }
263
264    fn add_attrs_if_missing(&self, target: &Self::Handle, attrs: Vec<html5ever::Attribute>) {
265        let attrs = attrs.into_iter().map(html5ever_to_blitz_attr).collect();
266        self.mutr().add_attrs_if_missing(*target, attrs);
267    }
268
269    fn remove_from_parent(&self, target: &Self::Handle) {
270        self.mutr().remove_node(*target);
271    }
272
273    fn reparent_children(&self, old_parent_id: &Self::Handle, new_parent_id: &Self::Handle) {
274        self.mutr()
275            .reparent_children(*old_parent_id, *new_parent_id);
276    }
277}
278
/// Smoke test: parse a small HTML document into a fresh `BaseDocument` and print the
/// resulting tree. Nothing is asserted; the test only checks that parsing does not panic.
#[test]
fn parses_some_html() {
    use blitz_dom::{BaseDocument, DocumentConfig};

    let mut doc = BaseDocument::new(DocumentConfig::default());
    let source = "<!DOCTYPE html><html><body><h1>hello world</h1></body></html>";

    {
        // The mutator is confined to this scope so that `doc` can be used again below.
        let mut mutr = doc.mutate();
        let sink = DocumentHtmlParser::new(&mut mutr);

        html5ever::parse_document(sink, Default::default())
            .from_utf8()
            .read_from(&mut source.as_bytes())
            .unwrap();
    }

    // Now our tree should have some nodes in it
    doc.print_tree();
}