scrape_core/parser/
html5.rs1use std::collections::HashMap;
4
5use html5ever::{ParseOpts, parse_document, tendril::TendrilSink};
6use markup5ever_rcdom::{Handle, NodeData, RcDom};
7
8use super::{ParseConfig, ParseError, ParseResult, Parser, private::Sealed};
9use crate::dom::{Document, NodeId};
10
/// HTML parser backed by the `html5ever` crate.
///
/// The type is a stateless unit struct: every parsing option is supplied per call
/// through the `ParseConfig` passed to `Parser::parse_with_config`.
#[derive(Clone, Copy, Debug, Default)]
pub struct Html5everParser;
28
// Marks `Html5everParser` with the `Sealed` trait imported from `super::private`.
// NOTE(review): presumably `Sealed` is a supertrait of `Parser` (sealed-trait pattern),
// which would make this impl a prerequisite for the `Parser` impl — confirm against
// the trait definition in the parent module.
impl Sealed for Html5everParser {}
30
31impl Parser for Html5everParser {
32 fn parse_with_config(&self, html: &str, config: &ParseConfig) -> ParseResult<Document> {
33 if html.trim().is_empty() {
34 return Err(ParseError::EmptyInput);
35 }
36
37 let dom = parse_document(RcDom::default(), ParseOpts::default())
38 .from_utf8()
39 .read_from(&mut html.as_bytes())
40 .map_err(|e| ParseError::InternalError(e.to_string()))?;
41
42 convert_rcdom_to_document(&dom, config)
43 }
44}
45
46fn convert_rcdom_to_document(dom: &RcDom, config: &ParseConfig) -> ParseResult<Document> {
48 let mut document = Document::new();
49 let mut depth = 0;
50
51 convert_node(&dom.document, &mut document, None, &mut depth, config)?;
52
53 Ok(document)
54}
55
56fn convert_node(
58 handle: &Handle,
59 document: &mut Document,
60 parent: Option<NodeId>,
61 depth: &mut usize,
62 config: &ParseConfig,
63) -> ParseResult<Option<NodeId>> {
64 if *depth > config.max_depth {
65 return Err(ParseError::MaxDepthExceeded { max_depth: config.max_depth });
66 }
67 *depth = depth.saturating_add(1);
68
69 let result = match &handle.data {
70 NodeData::Document => {
71 for child in handle.children.borrow().iter() {
73 if let Some(child_id) = convert_node(child, document, None, depth, config)?
74 && document.root().is_none()
75 {
76 document.set_root(child_id);
77 }
78 }
79 *depth = depth.saturating_sub(1);
80 return Ok(None);
81 }
82
83 NodeData::Element { name, attrs, .. } => {
84 let tag_name = name.local.to_string();
86
87 let attrs_ref = attrs.borrow();
88 let mut attributes = HashMap::with_capacity(attrs_ref.len());
89 for attr in attrs_ref.iter() {
90 let key = if attr.name.ns.is_empty() {
91 attr.name.local.to_string()
92 } else {
93 format!("{}:{}", attr.name.ns, attr.name.local)
94 };
95 attributes.insert(key, attr.value.to_string());
96 }
97
98 let node_id = document.create_element(tag_name, attributes);
99
100 if let Some(parent_id) = parent {
101 document.append_child(parent_id, node_id);
102 } else if document.root().is_none() {
103 document.set_root(node_id);
104 }
105
106 for child in handle.children.borrow().iter() {
108 convert_node(child, document, Some(node_id), depth, config)?;
109 }
110
111 Some(node_id)
112 }
113
114 NodeData::Text { contents } => {
115 let text = contents.borrow().to_string();
116
117 if !config.preserve_whitespace && text.trim().is_empty() {
119 *depth = depth.saturating_sub(1);
120 return Ok(None);
121 }
122
123 let node_id = document.create_text(text);
124
125 if let Some(parent_id) = parent {
126 document.append_child(parent_id, node_id);
127 }
128
129 Some(node_id)
130 }
131
132 NodeData::Comment { contents } => {
133 if !config.include_comments {
134 *depth = depth.saturating_sub(1);
135 return Ok(None);
136 }
137
138 let node_id = document.create_comment(contents.to_string());
139
140 if let Some(parent_id) = parent {
141 document.append_child(parent_id, node_id);
142 }
143
144 Some(node_id)
145 }
146
147 NodeData::Doctype { .. } | NodeData::ProcessingInstruction { .. } => {
148 *depth = depth.saturating_sub(1);
150 return Ok(None);
151 }
152 };
153
154 *depth = depth.saturating_sub(1);
155 Ok(result)
156}