1use std::{borrow::Cow, cell::UnsafeCell, collections::HashMap};
4
5use html5ever::{
6 Attribute as H5Attribute, ExpandedName, QualName as H5QualName, local_name, ns, parse_document,
7 tendril::TendrilSink,
8 tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink},
9};
10
11use crate::dom::{Attribute, Dom, NodeData, NodeId, QualName};
12
13pub fn parse_html(html: &str) -> Dom {
15 let sink = DomTreeSink::new();
16 parse_document(sink, Default::default())
17 .from_utf8()
18 .one(html.as_bytes())
19}
20
21#[allow(unsafe_code)]
38pub struct DomTreeSink {
39 dom: UnsafeCell<Dom>,
40 quirks_mode: UnsafeCell<QuirksMode>,
41 names: UnsafeCell<HashMap<NodeId, H5QualName>>,
42}
43
44#[allow(unsafe_code)]
45impl DomTreeSink {
46 pub fn new() -> Self {
47 Self {
48 dom: UnsafeCell::new(Dom::new()),
49 quirks_mode: UnsafeCell::new(QuirksMode::NoQuirks),
50 names: UnsafeCell::new(HashMap::new()),
51 }
52 }
53
54 fn dom(&self) -> &Dom {
55 unsafe { &*self.dom.get() }
57 }
58
59 #[allow(
60 clippy::mut_from_ref,
61 reason = "single-threaded non-reentrant parser; &mut-from-&self is sound"
62 )]
63 fn dom_mut(&self) -> &mut Dom {
64 unsafe { &mut *self.dom.get() }
66 }
67
68 fn names(&self) -> &HashMap<NodeId, H5QualName> {
69 unsafe { &*self.names.get() }
71 }
72
73 #[allow(
74 clippy::mut_from_ref,
75 reason = "single-threaded non-reentrant parser; &mut-from-&self is sound"
76 )]
77 fn names_mut(&self) -> &mut HashMap<NodeId, H5QualName> {
78 unsafe { &mut *self.names.get() }
80 }
81}
82
83impl Default for DomTreeSink {
84 fn default() -> Self {
85 Self::new()
86 }
87}
88
89fn convert_qualname(name: &H5QualName) -> QualName {
90 let ns_str = name.ns.to_string();
91 let ns = if ns_str.is_empty() || ns_str == "http://www.w3.org/1999/xhtml" {
92 None
93 } else {
94 Some(ns_str)
95 };
96 QualName {
97 ns,
98 local: name.local.to_string(),
99 }
100}
101
102fn convert_attrs(attrs: Vec<H5Attribute>) -> Vec<Attribute> {
103 attrs
104 .into_iter()
105 .map(|a| {
106 let ns_str = a.name.ns.to_string();
107 Attribute {
108 name: QualName {
109 ns: if ns_str.is_empty() {
110 None
111 } else {
112 Some(ns_str)
113 },
114 local: a.name.local.to_string(),
115 },
116 value: a.value.to_string(),
117 }
118 })
119 .collect()
120}
121
122#[allow(unsafe_code)]
123impl TreeSink for DomTreeSink {
124 type Handle = NodeId;
125 type Output = Dom;
126 type ElemName<'a> = ExpandedName<'a>;
127
128 fn finish(self) -> Self::Output {
129 self.dom.into_inner()
130 }
131
132 fn parse_error(&self, _msg: Cow<'static, str>) {}
133
134 fn get_document(&self) -> NodeId {
135 NodeId::DOCUMENT
136 }
137
138 fn elem_name<'a>(&'a self, target: &'a NodeId) -> ExpandedName<'a> {
139 if let Some(qn) = self.names().get(target) {
140 ExpandedName {
141 ns: &qn.ns,
142 local: &qn.local,
143 }
144 } else {
145 static NS: html5ever::Namespace = ns!(html);
146 static LOCAL: html5ever::LocalName = local_name!("");
147 ExpandedName {
148 ns: &NS,
149 local: &LOCAL,
150 }
151 }
152 }
153
154 fn create_element(
155 &self,
156 name: H5QualName,
157 attrs: Vec<H5Attribute>,
158 _flags: ElementFlags,
159 ) -> NodeId {
160 let id = self
161 .dom_mut()
162 .create_element(convert_qualname(&name), convert_attrs(attrs));
163 self.names_mut().insert(id, name);
164 id
165 }
166
167 fn create_comment(&self, text: html5ever::tendril::StrTendril) -> NodeId {
168 self.dom_mut().create_comment(text.to_string())
169 }
170
171 fn create_pi(
172 &self,
173 target: html5ever::tendril::StrTendril,
174 data: html5ever::tendril::StrTendril,
175 ) -> NodeId {
176 self.dom_mut()
177 .allocate_pi(target.to_string(), data.to_string())
178 }
179
180 fn append(&self, parent: &NodeId, child: NodeOrText<NodeId>) {
181 let dom = self.dom_mut();
182 match child {
183 NodeOrText::AppendNode(node_id) => {
184 dom.append_child(*parent, node_id);
185 }
186 NodeOrText::AppendText(text) => {
187 if let Some(last_child) = dom.get(*parent).and_then(|n| n.last_child) {
188 if let Some(node) = dom.get_mut(last_child) {
189 if let NodeData::Text(ref mut existing) = node.data {
190 existing.push_str(&text);
191 return;
192 }
193 }
194 }
195 let text_id = dom.create_text(text.to_string());
196 dom.append_child(*parent, text_id);
197 }
198 }
199 }
200
201 fn append_based_on_parent_node(
202 &self,
203 element: &NodeId,
204 prev_element: &NodeId,
205 child: NodeOrText<NodeId>,
206 ) {
207 let has_parent = self.dom().get(*element).and_then(|n| n.parent).is_some();
208 if has_parent {
209 self.append_before_sibling(element, child);
210 } else {
211 self.append(prev_element, child);
212 }
213 }
214
215 fn append_doctype_to_document(
216 &self,
217 name: html5ever::tendril::StrTendril,
218 public_id: html5ever::tendril::StrTendril,
219 system_id: html5ever::tendril::StrTendril,
220 ) {
221 let dom = self.dom_mut();
222 let doctype = dom.create_doctype(
223 name.to_string(),
224 public_id.to_string(),
225 system_id.to_string(),
226 );
227 dom.append_child(NodeId::DOCUMENT, doctype);
228 }
229
230 fn get_template_contents(&self, target: &NodeId) -> NodeId {
231 *target
232 }
233
234 fn same_node(&self, x: &NodeId, y: &NodeId) -> bool {
235 x == y
236 }
237
238 fn set_quirks_mode(&self, mode: QuirksMode) {
239 unsafe {
241 *self.quirks_mode.get() = mode;
242 }
243 }
244
245 fn append_before_sibling(&self, sibling: &NodeId, child: NodeOrText<NodeId>) {
246 let dom = self.dom_mut();
247 let parent = match dom.get(*sibling).and_then(|n| n.parent) {
248 Some(p) => p,
249 None => return,
250 };
251 match child {
252 NodeOrText::AppendNode(node_id) => {
253 dom.insert_before(parent, node_id, *sibling);
254 }
255 NodeOrText::AppendText(text) => {
256 let text_id = dom.create_text(text.to_string());
257 dom.insert_before(parent, text_id, *sibling);
258 }
259 }
260 }
261
262 fn add_attrs_if_missing(&self, target: &NodeId, attrs: Vec<H5Attribute>) {
263 let dom = self.dom_mut();
264 if let Some(node) = dom.get_mut(*target) {
265 if let Some(elem) = node.as_element_mut() {
266 for attr in convert_attrs(attrs) {
267 if !elem.attrs.iter().any(|a| a.name == attr.name) {
268 elem.attrs.push(attr);
269 }
270 }
271 }
272 }
273 }
274
275 fn remove_from_parent(&self, target: &NodeId) {
276 self.dom_mut().detach(*target);
277 }
278
279 fn reparent_children(&self, node: &NodeId, new_parent: &NodeId) {
280 self.dom_mut().reparent_children(*node, *new_parent);
281 }
282}
283
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{css_selectors::Element, dom::DomElement};

    /// Returns the first element child of the document node.
    fn root_element(dom: &Dom) -> NodeId {
        dom.child_elements(NodeId::DOCUMENT)[0]
    }

    /// Returns the first element child of `parent` whose local name equals
    /// `local`; panics when there is none.
    fn child_named(dom: &Dom, parent: NodeId, local: &str) -> NodeId {
        dom.child_elements(parent)
            .into_iter()
            .find(|&id| {
                dom.get(id)
                    .and_then(|n| n.as_element())
                    .is_some_and(|e| e.name.local == local)
            })
            .unwrap()
    }

    /// Returns the `body` element below the document's first element child.
    fn body_of(dom: &Dom) -> NodeId {
        child_named(dom, root_element(dom), "body")
    }

    #[test]
    fn parse_basic_html() {
        let dom = parse_html("<html><body><h1>Hello</h1></body></html>");
        let children = dom.children(NodeId::DOCUMENT);
        assert!(!children.is_empty(), "Document should have children");
    }

    #[test]
    fn parse_has_html_element() {
        let dom = parse_html("<html><head></head><body><p>Test</p></body></html>");
        let doc_children = dom.child_elements(NodeId::DOCUMENT);
        assert!(!doc_children.is_empty());

        let html_el = DomElement::new(&dom, doc_children[0]).unwrap();
        assert_eq!(html_el.local_name(), "html");
    }

    #[test]
    fn parse_text_content() {
        let dom = parse_html("<html><body><p>Hello world</p></body></html>");
        let body = body_of(&dom);
        let p = dom.child_elements(body)[0];
        assert_eq!(dom.text_content(p), "Hello world");
    }

    #[test]
    fn parse_attributes() {
        let dom = parse_html("<div id=\"main\" class=\"container\">test</div>");
        let body = body_of(&dom);
        let div = dom.child_elements(body)[0];
        let el = DomElement::new(&dom, div).unwrap();

        assert_eq!(el.id(), Some("main"));
        assert!(el.has_class("container"));
    }

    #[test]
    fn parse_nested_structure() {
        let dom = parse_html("<html><body><div><span>a</span><span>b</span></div></body></html>");
        let body = body_of(&dom);
        let div = child_named(&dom, body, "div");

        let spans = dom.child_elements(div);
        assert!(!spans.is_empty(), "expected at least 1 span");
        assert_eq!(dom.text_content(div), "ab");
    }

    #[test]
    fn bdd_parse_simple_html_via_parser() {
        let dom = parse_html("<html><body><h1>Hello</h1></body></html>");
        assert!(dom.get(NodeId::DOCUMENT).is_some());

        let body = body_of(&dom);
        let h1 = child_named(&dom, body, "h1");

        assert_eq!(dom.text_content(h1), "Hello");
    }

    #[test]
    fn bdd_query_elements_via_parser() {
        let dom = parse_html("<div class='content'><p>First</p><p>Second</p></div>");
        let ps = dom.get_elements_by_tag_name(NodeId::DOCUMENT, "p");
        assert_eq!(ps.len(), 2);
        assert_eq!(dom.text_content(ps[0]), "First");
    }

    #[test]
    fn bdd_mutate_dom_via_parser() {
        let mut dom = parse_html("<div><span>Old</span></div>");
        let body = body_of(&dom);
        let div = child_named(&dom, body, "div");

        // Build `<p>New</p>` and attach it after the existing `<span>`.
        let p = dom.create_element(QualName::new("p"), vec![]);
        let text = dom.create_text("New".to_string());
        dom.append_child(div, p);
        dom.append_child(p, text);

        assert_eq!(dom.children(div).len(), 2);
    }
}