scrape_core/parser/
html5.rs1use std::collections::HashMap;
4
5use html5ever::{ParseOpts, parse_document, tendril::TendrilSink};
6use markup5ever_rcdom::{Handle, NodeData, RcDom};
7
8use super::{ParseConfig, ParseError, ParseResult, Parser, private::Sealed};
9use crate::dom::{Document, DocumentIndex, NodeId};
10
/// HTML parser backed by `html5ever`.
///
/// This is a unit struct: it carries no state, so it is trivially copyable and
/// its `Default` value is the only value it has.
#[derive(Clone, Copy, Debug, Default)]
pub struct Html5everParser;
28
// Marker impl of the `Sealed` trait imported from the parent module's `private`
// submodule. NOTE(review): presumably `Parser` requires `Sealed` so that the
// trait cannot be implemented outside this crate — confirm in the parent module.
impl Sealed for Html5everParser {}
30
31impl Parser for Html5everParser {
32 fn parse_with_config(&self, html: &str, config: &ParseConfig) -> ParseResult<Document> {
33 self.parse_with_config_and_capacity(html, config, 256)
34 }
35}
36
37impl Html5everParser {
38 pub fn parse_with_config_and_capacity(
44 &self,
45 html: &str,
46 config: &ParseConfig,
47 capacity: usize,
48 ) -> ParseResult<Document> {
49 if html.trim().is_empty() {
50 return Err(ParseError::EmptyInput);
51 }
52
53 let dom = parse_document(RcDom::default(), ParseOpts::default())
54 .from_utf8()
55 .read_from(&mut html.as_bytes())
56 .map_err(|e| ParseError::InternalError(e.to_string()))?;
57
58 convert_rcdom_to_document_with_capacity(&dom, config, capacity)
59 }
60}
61
62fn convert_rcdom_to_document(dom: &RcDom, config: &ParseConfig) -> ParseResult<Document> {
64 convert_rcdom_to_document_with_capacity(dom, config, 256)
65}
66
67fn convert_rcdom_to_document_with_capacity(
69 dom: &RcDom,
70 config: &ParseConfig,
71 capacity: usize,
72) -> ParseResult<Document> {
73 let mut document = crate::dom::DocumentImpl::<crate::dom::Building>::with_capacity(capacity);
74 let mut depth = 0;
75 let mut index = DocumentIndex::new();
76
77 convert_node(&dom.document, &mut document, None, &mut depth, config, &mut index)?;
78
79 let mut document = document.build();
80 document.set_index(index);
81 Ok(document)
82}
83
84fn convert_node(
86 handle: &Handle,
87 document: &mut crate::dom::DocumentImpl<crate::dom::Building>,
88 parent: Option<NodeId>,
89 depth: &mut usize,
90 config: &ParseConfig,
91 index: &mut DocumentIndex,
92) -> ParseResult<Option<NodeId>> {
93 if *depth > config.max_depth {
94 return Err(ParseError::MaxDepthExceeded { max_depth: config.max_depth });
95 }
96 *depth = depth.saturating_add(1);
97
98 let result = match &handle.data {
99 NodeData::Document => {
100 for child in handle.children.borrow().iter() {
102 if let Some(child_id) = convert_node(child, document, None, depth, config, index)?
103 && document.root().is_none()
104 {
105 document.set_root(child_id);
106 }
107 }
108 *depth = depth.saturating_sub(1);
109 return Ok(None);
110 }
111
112 NodeData::Element { name, attrs, .. } => {
113 let tag_name = name.local.to_string();
115
116 let attrs_ref = attrs.borrow();
117 let mut attributes = HashMap::with_capacity(attrs_ref.len());
118 for attr in attrs_ref.iter() {
119 let key = if attr.name.ns.is_empty() {
120 attr.name.local.to_string()
121 } else {
122 format!("{}:{}", attr.name.ns, attr.name.local)
123 };
124 attributes.insert(key, attr.value.to_string());
125 }
126
127 let node_id = document.create_element(tag_name, attributes.clone());
128
129 if let Some(id_attr) = attributes.get("id") {
130 index.register_id(id_attr.clone(), node_id);
131 }
132 if let Some(class_attr) = attributes.get("class") {
133 index.register_classes(class_attr, node_id);
134 }
135
136 if let Some(parent_id) = parent {
137 document.append_child(parent_id, node_id);
138 } else if document.root().is_none() {
139 document.set_root(node_id);
140 }
141
142 for child in handle.children.borrow().iter() {
144 convert_node(child, document, Some(node_id), depth, config, index)?;
145 }
146
147 Some(node_id)
148 }
149
150 NodeData::Text { contents } => {
151 let text = contents.borrow().to_string();
152
153 if !config.preserve_whitespace && text.trim().is_empty() {
155 *depth = depth.saturating_sub(1);
156 return Ok(None);
157 }
158
159 let node_id = document.create_text(text);
160
161 if let Some(parent_id) = parent {
162 document.append_child(parent_id, node_id);
163 }
164
165 Some(node_id)
166 }
167
168 NodeData::Comment { contents } => {
169 if !config.include_comments {
170 *depth = depth.saturating_sub(1);
171 return Ok(None);
172 }
173
174 let node_id = document.create_comment(contents.to_string());
175
176 if let Some(parent_id) = parent {
177 document.append_child(parent_id, node_id);
178 }
179
180 Some(node_id)
181 }
182
183 NodeData::Doctype { .. } | NodeData::ProcessingInstruction { .. } => {
184 *depth = depth.saturating_sub(1);
186 return Ok(None);
187 }
188 };
189
190 *depth = depth.saturating_sub(1);
191 Ok(result)
192}