Skip to main content

hpx_browser/
html_parser.rs

1//! html5ever integration — parse HTML strings into a [`Dom`].
2
3use std::{borrow::Cow, cell::UnsafeCell, collections::HashMap};
4
5use html5ever::{
6    Attribute as H5Attribute, ExpandedName, QualName as H5QualName, local_name, ns, parse_document,
7    tendril::TendrilSink,
8    tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink},
9};
10
11use crate::dom::{Attribute, Dom, NodeData, NodeId, QualName};
12
13/// Parse an HTML document string into a DOM tree.
14pub fn parse_html(html: &str) -> Dom {
15    let sink = DomTreeSink::new();
16    parse_document(sink, Default::default())
17        .from_utf8()
18        .one(html.as_bytes())
19}
20
21// ---------------------------------------------------------------------------
22// DomTreeSink — bridges html5ever's TreeSink to our Dom
23// ---------------------------------------------------------------------------
24
/// `TreeSink` implementation that builds a DOM.
///
/// Uses `UnsafeCell` because html5ever's `TreeSink` trait takes `&self`
/// but tree building inherently requires mutation.
///
/// # Safety
///
/// `DomTreeSink` is **not `Sync`**. It is owned by a single parsing
/// thread for the lifetime of the parse. html5ever calls `TreeSink`
/// methods serially from that thread — never concurrently, never
/// reentrantly. References handed out by the private accessor helpers are
/// dropped before the next callback runs.
///
/// NOTE(review): `elem_name` is the one callback whose result keeps
/// borrowing from the sink after it returns — the `ExpandedName` it hands
/// back points into `names`. That is only sound if html5ever drops the name
/// before calling anything that inserts into `names` (`create_element`);
/// confirm against the html5ever version in use.
#[allow(unsafe_code)]
pub struct DomTreeSink {
    /// The tree under construction; moved out by `finish`.
    dom: UnsafeCell<Dom>,
    /// Last mode passed to `set_quirks_mode`; initialised to
    /// `QuirksMode::NoQuirks`. Nothing visible in this file reads it back.
    quirks_mode: UnsafeCell<QuirksMode>,
    /// html5ever's original qualified name for each element created through
    /// `create_element`, keyed by node id, so `elem_name` can return
    /// borrowed namespace/local-name atoms.
    names: UnsafeCell<HashMap<NodeId, H5QualName>>,
}
43
44#[allow(unsafe_code)]
45impl DomTreeSink {
46    pub fn new() -> Self {
47        Self {
48            dom: UnsafeCell::new(Dom::new()),
49            quirks_mode: UnsafeCell::new(QuirksMode::NoQuirks),
50            names: UnsafeCell::new(HashMap::new()),
51        }
52    }
53
54    fn dom(&self) -> &Dom {
55        // SAFETY: single-threaded parser, no concurrent or reentrant access.
56        unsafe { &*self.dom.get() }
57    }
58
59    #[allow(
60        clippy::mut_from_ref,
61        reason = "single-threaded non-reentrant parser; &mut-from-&self is sound"
62    )]
63    fn dom_mut(&self) -> &mut Dom {
64        // SAFETY: single-threaded parser, no concurrent or reentrant access.
65        unsafe { &mut *self.dom.get() }
66    }
67
68    fn names(&self) -> &HashMap<NodeId, H5QualName> {
69        // SAFETY: single-threaded parser.
70        unsafe { &*self.names.get() }
71    }
72
73    #[allow(
74        clippy::mut_from_ref,
75        reason = "single-threaded non-reentrant parser; &mut-from-&self is sound"
76    )]
77    fn names_mut(&self) -> &mut HashMap<NodeId, H5QualName> {
78        // SAFETY: single-threaded parser.
79        unsafe { &mut *self.names.get() }
80    }
81}
82
83impl Default for DomTreeSink {
84    fn default() -> Self {
85        Self::new()
86    }
87}
88
89fn convert_qualname(name: &H5QualName) -> QualName {
90    let ns_str = name.ns.to_string();
91    let ns = if ns_str.is_empty() || ns_str == "http://www.w3.org/1999/xhtml" {
92        None
93    } else {
94        Some(ns_str)
95    };
96    QualName {
97        ns,
98        local: name.local.to_string(),
99    }
100}
101
102fn convert_attrs(attrs: Vec<H5Attribute>) -> Vec<Attribute> {
103    attrs
104        .into_iter()
105        .map(|a| {
106            let ns_str = a.name.ns.to_string();
107            Attribute {
108                name: QualName {
109                    ns: if ns_str.is_empty() {
110                        None
111                    } else {
112                        Some(ns_str)
113                    },
114                    local: a.name.local.to_string(),
115                },
116                value: a.value.to_string(),
117            }
118        })
119        .collect()
120}
121
122#[allow(unsafe_code)]
123impl TreeSink for DomTreeSink {
124    type Handle = NodeId;
125    type Output = Dom;
126    type ElemName<'a> = ExpandedName<'a>;
127
128    fn finish(self) -> Self::Output {
129        self.dom.into_inner()
130    }
131
132    fn parse_error(&self, _msg: Cow<'static, str>) {}
133
134    fn get_document(&self) -> NodeId {
135        NodeId::DOCUMENT
136    }
137
138    fn elem_name<'a>(&'a self, target: &'a NodeId) -> ExpandedName<'a> {
139        if let Some(qn) = self.names().get(target) {
140            ExpandedName {
141                ns: &qn.ns,
142                local: &qn.local,
143            }
144        } else {
145            static NS: html5ever::Namespace = ns!(html);
146            static LOCAL: html5ever::LocalName = local_name!("");
147            ExpandedName {
148                ns: &NS,
149                local: &LOCAL,
150            }
151        }
152    }
153
154    fn create_element(
155        &self,
156        name: H5QualName,
157        attrs: Vec<H5Attribute>,
158        _flags: ElementFlags,
159    ) -> NodeId {
160        let id = self
161            .dom_mut()
162            .create_element(convert_qualname(&name), convert_attrs(attrs));
163        self.names_mut().insert(id, name);
164        id
165    }
166
167    fn create_comment(&self, text: html5ever::tendril::StrTendril) -> NodeId {
168        self.dom_mut().create_comment(text.to_string())
169    }
170
171    fn create_pi(
172        &self,
173        target: html5ever::tendril::StrTendril,
174        data: html5ever::tendril::StrTendril,
175    ) -> NodeId {
176        self.dom_mut()
177            .allocate_pi(target.to_string(), data.to_string())
178    }
179
180    fn append(&self, parent: &NodeId, child: NodeOrText<NodeId>) {
181        let dom = self.dom_mut();
182        match child {
183            NodeOrText::AppendNode(node_id) => {
184                dom.append_child(*parent, node_id);
185            }
186            NodeOrText::AppendText(text) => {
187                if let Some(last_child) = dom.get(*parent).and_then(|n| n.last_child) {
188                    if let Some(node) = dom.get_mut(last_child) {
189                        if let NodeData::Text(ref mut existing) = node.data {
190                            existing.push_str(&text);
191                            return;
192                        }
193                    }
194                }
195                let text_id = dom.create_text(text.to_string());
196                dom.append_child(*parent, text_id);
197            }
198        }
199    }
200
201    fn append_based_on_parent_node(
202        &self,
203        element: &NodeId,
204        prev_element: &NodeId,
205        child: NodeOrText<NodeId>,
206    ) {
207        let has_parent = self.dom().get(*element).and_then(|n| n.parent).is_some();
208        if has_parent {
209            self.append_before_sibling(element, child);
210        } else {
211            self.append(prev_element, child);
212        }
213    }
214
215    fn append_doctype_to_document(
216        &self,
217        name: html5ever::tendril::StrTendril,
218        public_id: html5ever::tendril::StrTendril,
219        system_id: html5ever::tendril::StrTendril,
220    ) {
221        let dom = self.dom_mut();
222        let doctype = dom.create_doctype(
223            name.to_string(),
224            public_id.to_string(),
225            system_id.to_string(),
226        );
227        dom.append_child(NodeId::DOCUMENT, doctype);
228    }
229
230    fn get_template_contents(&self, target: &NodeId) -> NodeId {
231        *target
232    }
233
234    fn same_node(&self, x: &NodeId, y: &NodeId) -> bool {
235        x == y
236    }
237
238    fn set_quirks_mode(&self, mode: QuirksMode) {
239        // SAFETY: single-threaded parser, no concurrent access.
240        unsafe {
241            *self.quirks_mode.get() = mode;
242        }
243    }
244
245    fn append_before_sibling(&self, sibling: &NodeId, child: NodeOrText<NodeId>) {
246        let dom = self.dom_mut();
247        let parent = match dom.get(*sibling).and_then(|n| n.parent) {
248            Some(p) => p,
249            None => return,
250        };
251        match child {
252            NodeOrText::AppendNode(node_id) => {
253                dom.insert_before(parent, node_id, *sibling);
254            }
255            NodeOrText::AppendText(text) => {
256                let text_id = dom.create_text(text.to_string());
257                dom.insert_before(parent, text_id, *sibling);
258            }
259        }
260    }
261
262    fn add_attrs_if_missing(&self, target: &NodeId, attrs: Vec<H5Attribute>) {
263        let dom = self.dom_mut();
264        if let Some(node) = dom.get_mut(*target) {
265            if let Some(elem) = node.as_element_mut() {
266                for attr in convert_attrs(attrs) {
267                    if !elem.attrs.iter().any(|a| a.name == attr.name) {
268                        elem.attrs.push(attr);
269                    }
270                }
271            }
272        }
273    }
274
275    fn remove_from_parent(&self, target: &NodeId) {
276        self.dom_mut().detach(*target);
277    }
278
279    fn reparent_children(&self, node: &NodeId, new_parent: &NodeId) {
280        self.dom_mut().reparent_children(*node, *new_parent);
281    }
282}
283
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{css_selectors::Element, dom::DomElement};

    /// First child element of `parent` whose local name equals `tag`.
    /// Panics when there is no such child.
    fn child_named(dom: &Dom, parent: NodeId, tag: &str) -> NodeId {
        dom.child_elements(parent)
            .into_iter()
            .find(|&id| {
                dom.get(id)
                    .and_then(|n| n.as_element())
                    .is_some_and(|e| e.name.local == tag)
            })
            .unwrap()
    }

    /// The `body` element under the document's first element child.
    fn body_of(dom: &Dom) -> NodeId {
        let html = dom.child_elements(NodeId::DOCUMENT)[0];
        child_named(dom, html, "body")
    }

    #[test]
    fn parse_basic_html() {
        let dom = parse_html("<html><body><h1>Hello</h1></body></html>");
        let top_level = dom.children(NodeId::DOCUMENT);
        assert!(!top_level.is_empty(), "Document should have children");
    }

    #[test]
    fn parse_has_html_element() {
        let dom = parse_html("<html><head></head><body><p>Test</p></body></html>");
        let roots = dom.child_elements(NodeId::DOCUMENT);
        assert!(!roots.is_empty());

        let root = DomElement::new(&dom, roots[0]).unwrap();
        assert_eq!(root.local_name(), "html");
    }

    #[test]
    fn parse_text_content() {
        let dom = parse_html("<html><body><p>Hello world</p></body></html>");
        let body = body_of(&dom);
        let paragraph = dom.child_elements(body)[0];
        assert_eq!(dom.text_content(paragraph), "Hello world");
    }

    #[test]
    fn parse_attributes() {
        let dom = parse_html("<div id=\"main\" class=\"container\">test</div>");
        let body = body_of(&dom);
        let first = dom.child_elements(body)[0];
        let el = DomElement::new(&dom, first).unwrap();

        assert_eq!(el.id(), Some("main"));
        assert!(el.has_class("container"));
    }

    #[test]
    fn parse_nested_structure() {
        let dom = parse_html("<html><body><div><span>a</span><span>b</span></div></body></html>");
        let div = child_named(&dom, body_of(&dom), "div");

        let spans = dom.child_elements(div);
        assert!(!spans.is_empty(), "expected at least 1 span");
        assert_eq!(dom.text_content(div), "ab");
    }

    // BDD: "Parse simple HTML" scenario via html5ever
    #[test]
    fn bdd_parse_simple_html_via_parser() {
        let dom = parse_html("<html><body><h1>Hello</h1></body></html>");
        assert!(dom.get(NodeId::DOCUMENT).is_some());

        let heading = child_named(&dom, body_of(&dom), "h1");
        assert_eq!(dom.text_content(heading), "Hello");
    }

    // BDD: "Query elements by selector" scenario via html5ever
    #[test]
    fn bdd_query_elements_via_parser() {
        let dom = parse_html("<div class='content'><p>First</p><p>Second</p></div>");
        let paragraphs = dom.get_elements_by_tag_name(NodeId::DOCUMENT, "p");
        assert_eq!(paragraphs.len(), 2);
        assert_eq!(dom.text_content(paragraphs[0]), "First");
    }

    // BDD: "Mutate DOM tree" scenario via html5ever
    #[test]
    fn bdd_mutate_dom_via_parser() {
        let mut dom = parse_html("<div><span>Old</span></div>");
        let div = child_named(&dom, body_of(&dom), "div");

        // Mutate the parsed tree: add a <p>New</p> next to the existing span.
        let p = dom.create_element(QualName::new("p"), Vec::new());
        let text = dom.create_text("New".to_string());
        dom.append_child(div, p);
        dom.append_child(p, text);

        assert_eq!(dom.children(div).len(), 2);
    }
}
443}